From 74604d6170b36a6f5352c3b2f7e4d5e9e52c3d7a Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Fri, 6 Feb 2026 13:38:12 +0100 Subject: [PATCH 01/33] Add CUDACPP files as PLUGIN --- PLUGIN/CUDACPP_OUTPUT/AUTHORS | 1 + PLUGIN/CUDACPP_OUTPUT/CHANGELOG.md | 67 + PLUGIN/CUDACPP_OUTPUT/COPYING | 1 + PLUGIN/CUDACPP_OUTPUT/COPYING.LESSER | 1 + PLUGIN/CUDACPP_OUTPUT/COPYRIGHT | 1 + PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.cc | 6258 ++++++++++++++ PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.h | 3426 ++++++++ .../MadtRex/makefiles/cudacpp_driver.mk | 878 ++ .../MadtRex/makefiles/cudacpp_rex_src.mk | 205 + .../MadtRex/makefiles/cudacpp_runner.mk | 891 ++ PLUGIN/CUDACPP_OUTPUT/MadtRex/rex.mk | 26 + PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.cc | 939 +++ PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.h | 283 + .../MadtRex/template_files/rwgt_driver.inc | 200 + .../MadtRex/template_files/rwgt_instance.cc | 176 + .../MadtRex/template_files/rwgt_instance.h | 74 + .../MadtRex/template_files/rwgt_runner_cc.inc | 64 + .../MadtRex/template_files/rwgt_runner_h.inc | 31 + PLUGIN/CUDACPP_OUTPUT/__init__.py | 81 + .../create_acceptance_from_file.py | 132 + .../acceptance_tests/madgraph_launch.template | 29 + .../simple_cross_check/simd_cpp_eemumua_float | 10 + .../simd_cpp_heft_ggh_double | 10 + .../simple_cross_check/simd_cpp_pptt_mixed | 10 + .../simple_cross_check/simd_cpp_vector_size | 17 + .../acceptance_tests/test_simd_madevent.py | 350 + .../test_simd_madevent.template | 115 + .../aloha/template_files/gpu/helas.cu | 11 + .../aloha/template_files/gpu/helas.h | 910 +++ PLUGIN/CUDACPP_OUTPUT/launch_plugin.py | 140 + .../iolibs/template_files/.clang-format | 229 + .../madgraph/iolibs/template_files/AUTHORS | 34 + .../template_files/CMake/CMakeLists.txt | 19 + .../iolibs/template_files/CMake/Compilers.txt | 7 + .../iolibs/template_files/CMake/Macros.txt | 15 + .../iolibs/template_files/CMake/Platforms.txt | 8 + .../CMake/SubProcesses/CMakeLists.txt | 9 + .../CMake/SubProcesses/CMakeLists_P.txt | 29 + 
.../template_files/CMake/src/CMakeLists.txt | 10 + .../madgraph/iolibs/template_files/COPYING | 674 ++ .../iolibs/template_files/COPYING.LESSER | 165 + .../madgraph/iolibs/template_files/COPYRIGHT | 58 + .../cpp_model_parameters_cc.inc | 100 + .../template_files/cpp_model_parameters_h.inc | 260 + .../iolibs/template_files/gpu/Bridge.h | 633 ++ .../template_files/gpu/BridgeKernels.cc | 163 + .../iolibs/template_files/gpu/BridgeKernels.h | 145 + .../gpu/CommonRandomNumberKernel.cc | 38 + .../template_files/gpu/CommonRandomNumbers.h | 96 + .../template_files/gpu/CrossSectionKernels.cc | 237 + .../template_files/gpu/CrossSectionKernels.h | 138 + .../gpu/CurandRandomNumberKernel.cc | 135 + .../template_files/gpu/EventStatistics.h | 174 + .../template_files/gpu/GpuAbstraction.h | 165 + .../iolibs/template_files/gpu/GpuRuntime.h | 101 + .../gpu/HiprandRandomNumberKernel.cc | 145 + .../iolibs/template_files/gpu/MadgraphTest.h | 340 + .../gpu/MatrixElementKernels.cc | 537 ++ .../template_files/gpu/MatrixElementKernels.h | 253 + .../gpu/MemoryAccessAmplitudes.h | 164 + .../gpu/MemoryAccessChannelIds.h | 125 + .../gpu/MemoryAccessCouplings.h | 270 + .../gpu/MemoryAccessCouplingsFixed.h | 84 + .../gpu/MemoryAccessDenominators.h | 32 + .../template_files/gpu/MemoryAccessGs.h | 170 + .../template_files/gpu/MemoryAccessHelpers.h | 157 + .../gpu/MemoryAccessIflavorVec.h | 124 + .../gpu/MemoryAccessMatrixElements.h | 146 + .../template_files/gpu/MemoryAccessMomenta.h | 275 + .../gpu/MemoryAccessNumerators.h | 32 + .../gpu/MemoryAccessRandomNumbers.h | 144 + .../template_files/gpu/MemoryAccessVectors.h | 137 + .../gpu/MemoryAccessWavefunctions.h | 169 + .../template_files/gpu/MemoryAccessWeights.h | 149 + .../iolibs/template_files/gpu/MemoryBuffers.h | 606 ++ .../gpu/RamboSamplingKernels.cc | 183 + .../template_files/gpu/RamboSamplingKernels.h | 134 + .../template_files/gpu/RandomNumberKernels.h | 191 + .../iolibs/template_files/gpu/check_sa.cc | 1243 +++ 
.../iolibs/template_files/gpu/color_sum.cc | 418 + .../iolibs/template_files/gpu/color_sum.h | 102 + .../iolibs/template_files/gpu/coloramps.h | 71 + .../template_files/gpu/constexpr_math.h | 334 + .../iolibs/template_files/gpu/counters.cc | 93 + .../template_files/gpu/cpp_hel_amps_cc.inc | 13 + .../template_files/gpu/cpp_hel_amps_h.inc | 35 + .../iolibs/template_files/gpu/cudacpp.mk | 1242 +++ .../template_files/gpu/cudacpp_config.mk | 97 + .../template_files/gpu/cudacpp_overlay.mk | 295 + .../iolibs/template_files/gpu/cudacpp_src.mk | 185 + .../iolibs/template_files/gpu/cudacpp_test.mk | 50 + .../template_files/gpu/epoch_process_id.h | 16 + .../iolibs/template_files/gpu/fbridge.cc | 160 + .../iolibs/template_files/gpu/fbridge.h | 52 + .../iolibs/template_files/gpu/fbridge.inc | 105 + .../template_files/gpu/fbridge_common.inc | 31 + .../iolibs/template_files/gpu/fcheck_sa.f | 90 + .../iolibs/template_files/gpu/fsampler.cc | 165 + .../iolibs/template_files/gpu/fsampler.inc | 42 + .../template_files/gpu/makefile_wrapper.mk | 3 + .../iolibs/template_files/gpu/mgOnGpuConfig.h | 297 + .../template_files/gpu/mgOnGpuCxtypes.h | 744 ++ .../template_files/gpu/mgOnGpuFptypes.h | 101 + .../template_files/gpu/mgOnGpuVectors.h | 931 +++ .../madgraph/iolibs/template_files/gpu/nvtx.h | 74 + .../template_files/gpu/ompnumthreads.cc | 25 + .../iolibs/template_files/gpu/ompnumthreads.h | 63 + .../iolibs/template_files/gpu/perf.py | 351 + .../iolibs/template_files/gpu/processConfig.h | 16 + .../iolibs/template_files/gpu/process_cc.inc | 92 + .../template_files/gpu/process_class.inc | 84 + .../gpu/process_function_definitions.inc | 877 ++ .../iolibs/template_files/gpu/process_h.inc | 137 + .../template_files/gpu/process_matrix.inc | 50 + .../gpu/process_sigmaKin_function.inc | 333 + .../iolibs/template_files/gpu/profile.sh | 187 + .../iolibs/template_files/gpu/rambo.h | 191 + .../iolibs/template_files/gpu/runTest.cc | 449 ++ .../iolibs/template_files/gpu/smatrix_multi.f | 99 + 
.../iolibs/template_files/gpu/testmisc.cc | 511 ++ .../iolibs/template_files/gpu/testxxx.cc | 455 ++ .../template_files/gpu/testxxx_cc_ref.txt | 4036 ++++++++++ .../iolibs/template_files/gpu/timer.h | 72 + .../iolibs/template_files/gpu/timermap.h | 161 + .../iolibs/template_files/gpu/umami.cc | 530 ++ .../iolibs/template_files/gpu/umami.h | 212 + .../iolibs/template_files/gpu/valgrind.h | 7170 +++++++++++++++++ .../madevent_makefile_source_addon | 4 + .../iolibs/template_files/read_slha.cc | 249 + .../iolibs/template_files/read_slha.h | 51 + PLUGIN/CUDACPP_OUTPUT/model_handling.py | 2554 ++++++ PLUGIN/CUDACPP_OUTPUT/output.py | 528 ++ PLUGIN/CUDACPP_OUTPUT/trex.py | 820 ++ 133 files changed, 50842 insertions(+) create mode 120000 PLUGIN/CUDACPP_OUTPUT/AUTHORS create mode 100644 PLUGIN/CUDACPP_OUTPUT/CHANGELOG.md create mode 120000 PLUGIN/CUDACPP_OUTPUT/COPYING create mode 120000 PLUGIN/CUDACPP_OUTPUT/COPYING.LESSER create mode 120000 PLUGIN/CUDACPP_OUTPUT/COPYRIGHT create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_driver.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_rex_src.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_runner.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/rex.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_driver.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_instance.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_instance.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_runner_cc.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_runner_h.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/__init__.py create mode 100644 
PLUGIN/CUDACPP_OUTPUT/acceptance_tests/create_acceptance_from_file.py create mode 100644 PLUGIN/CUDACPP_OUTPUT/acceptance_tests/madgraph_launch.template create mode 100644 PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_eemumua_float create mode 100644 PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_heft_ggh_double create mode 100644 PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_pptt_mixed create mode 100644 PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_vector_size create mode 100644 PLUGIN/CUDACPP_OUTPUT/acceptance_tests/test_simd_madevent.py create mode 100755 PLUGIN/CUDACPP_OUTPUT/acceptance_tests/test_simd_madevent.template create mode 100644 PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.cu create mode 100644 PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/launch_plugin.py create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/.clang-format create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/AUTHORS create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/CMakeLists.txt create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Compilers.txt create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Macros.txt create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Platforms.txt create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/SubProcesses/CMakeLists.txt create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/SubProcesses/CMakeLists_P.txt create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/src/CMakeLists.txt create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYING create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYING.LESSER create mode 100644 
PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumbers.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/HiprandRandomNumberKernel.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessChannelIds.h create mode 100644 
PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessIflavorVec.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWeights.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h create mode 100644 
PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/coloramps.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_overlay.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/epoch_process_id.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge_common.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/makefile_wrapper.mk create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h create 
mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/nvtx.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/perf.py create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/profile.sh create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx_cc_ref.txt create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/timer.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/timermap.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc create 
mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/valgrind.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/madevent_makefile_source_addon create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/read_slha.cc create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/read_slha.h create mode 100644 PLUGIN/CUDACPP_OUTPUT/model_handling.py create mode 100644 PLUGIN/CUDACPP_OUTPUT/output.py create mode 100644 PLUGIN/CUDACPP_OUTPUT/trex.py diff --git a/PLUGIN/CUDACPP_OUTPUT/AUTHORS b/PLUGIN/CUDACPP_OUTPUT/AUTHORS new file mode 120000 index 0000000000..be479a3263 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/AUTHORS @@ -0,0 +1 @@ +madgraph/iolibs/template_files/AUTHORS \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/CHANGELOG.md b/PLUGIN/CUDACPP_OUTPUT/CHANGELOG.md new file mode 100644 index 0000000000..652e68d055 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/CHANGELOG.md @@ -0,0 +1,67 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is loosely based on [Keep a Changelog](https://keepachangelog.com). + +-------------------------------------------------------------------------------- + +## [Unreleased] - 2024-10-06 + +### Changed + +- Updated cudacpp version to 1.00.01. + +### Fixed + +- Platform-specific issues + - AV ([#1011]) Added workaround for Floating Point Exceptions in vxxxxx in the HIP backend. + +- Infrastructure issues + - AV ([#1013]) Fix release scripts to create 'v1.00.01' tags from a '(1,0,1)' python tuple. + - AV ([#1015]) Remove add_input_for_banner from output.py (plugin_run_card is not needed in cudacpp). + - AV ([#995]) In cudacpp_config.mk move default FPTYPE from 'd' to 'm' (already the default floating_type in run_card.dat). 
+ +-------------------------------------------------------------------------------- + +## [1.00.00] - 2024-10-03 + +### Added + +- (OM+AV+SR+SH+ZW+JT+DM) First release of the MG5aMC CUDACPP plugin. + - Validated and released for MG5aMC version 3.6.0. + - Hosted in the https://github.com/madgraph5/madgraph4gpu original repo. + - Repo uses the original directory structure (plugin is epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT). + +### Known issues + +- This section lists some of the main new issues identified in release v1.00.00. + +- General issues + - ([#959]) Cross-section instabilities when changing vector size between 32 and 16384. + - ([#993]) LHE file mismatch (fortran vs cudacpp) in the experimental multi-backend gridpacks. + +- Platform-specific issues + - ([#1011]) Floating Point Exceptions in vxxxxx in the HIP backend. + +- Physics-process-specific issues + - ([#944]) Cross-section mismatch (fortran vs cudacpp) in Drell-Yan plus 4 jets. + - ([#942]) Floating Point Exceptions in Drell-Yan plus 0 to 2 jets (workaround: `CUDACPP_RUNTIME_DISABLEFPE=1`). + - ([#846]) ME mismatch (HRDCOD=1 vs HRDCOD=1) in EWdim6 models. + - ([#601]) Builds fail with very complex final states (e.g. gg to ttgggg). 
+ +-------------------------------------------------------------------------------- + +[1.00.00]: https://github.com/madgraph5/madgraph4gpu/releases/tag/cudacpp_for3.6.0_v1.00.00 +[Unreleased]: https://github.com/madgraph5/madgraph4gpu/releases/compare/cudacpp_for3.6.0_v1.00.00...HEAD + +[#601]: https://github.com/madgraph5/madgraph4gpu/issues/601 +[#846]: https://github.com/madgraph5/madgraph4gpu/issues/846 +[#942]: https://github.com/madgraph5/madgraph4gpu/issues/942 +[#944]: https://github.com/madgraph5/madgraph4gpu/issues/944 +[#959]: https://github.com/madgraph5/madgraph4gpu/issues/959 +[#993]: https://github.com/madgraph5/madgraph4gpu/issues/993 +[#995]: https://github.com/madgraph5/madgraph4gpu/issues/995 +[#1011]: https://github.com/madgraph5/madgraph4gpu/issues/1011 +[#1013]: https://github.com/madgraph5/madgraph4gpu/issues/1013 +[#1015]: https://github.com/madgraph5/madgraph4gpu/issues/1015 diff --git a/PLUGIN/CUDACPP_OUTPUT/COPYING b/PLUGIN/CUDACPP_OUTPUT/COPYING new file mode 120000 index 0000000000..c8358bae73 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/COPYING @@ -0,0 +1 @@ +madgraph/iolibs/template_files/COPYING \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/COPYING.LESSER b/PLUGIN/CUDACPP_OUTPUT/COPYING.LESSER new file mode 120000 index 0000000000..d6948159ab --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/COPYING.LESSER @@ -0,0 +1 @@ +madgraph/iolibs/template_files/COPYING.LESSER \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/COPYRIGHT b/PLUGIN/CUDACPP_OUTPUT/COPYRIGHT new file mode 120000 index 0000000000..79efcd566c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/COPYRIGHT @@ -0,0 +1 @@ +madgraph/iolibs/template_files/COPYRIGHT \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.cc b/PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.cc new file mode 100644 index 0000000000..4d3dd70a5b --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.cc @@ -0,0 +1,6258 @@ +/*** + * ______ + * | ___ \ + * | |_/ /_____ __ + * | // _ \ 
\/ / + * | |\ \ __/> < + * \_| \_\___/_/\_\ + * + ***/ +// +// *R*apid *e*vent e*x*traction Version 1.0.0 +// Rex is a C++ library for parsing and manipulating Les Houches Event-format (LHE) files. +// It is designed to fast and lightweight, in comparison to internal parsers in programs like MadGraph. +// Currently, Rex is in development and may not contain all features necessary for full LHE parsing. +// +// Copyright © 2023-2025 CERN, CERN Author Zenny Wettersten. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// All rights not expressly granted are reserved. +// + +#ifndef _REX_CC_ +#define _REX_CC_ + +#include "Rex.h" + +namespace REX +{ + + std::string to_upper(const std::string &str) + { + std::string result = str; + std::transform(result.begin(), result.end(), result.begin(), ::toupper); + return result; + } + + // Generic warning function for printing warnings without throwing anything + void warning(std::string message) + { + std::cout << "\n\033[1;33mWarning: "; + std::cout << message; + std::cout << "\033[0m\n"; + } + + // Explicit instantiation of templated functions and structs/classes + template std::shared_ptr> ind_sort(const std::vector &vector, std::function comp); + template std::shared_ptr> ind_sort(const std::vector &vector, std::function comp); + + template int ctoi(std::string str); + template int ctoi(std::string_view str); + + template double ctod(std::string str); + template double ctod(std::string_view str); + + template std::shared_ptr> vec_elem_mult(const std::vector &vec1, const std::vector &vec2); + template std::shared_ptr> vec_elem_mult(const std::vector &vec1, const std::vector &vec2); + template std::shared_ptr> vec_elem_mult(const std::vector &vec1, const std::vector &vec2); + + template std::vector subvector(std::vector original, size_t begin, size_t end); + template std::vector subvector(std::vector original, size_t begin, size_t end); + template std::vector subvector(std::vector original, size_t 
begin, size_t end); + template std::vector subvector(std::vector original, size_t begin, size_t end); + template std::vector subvector(std::vector original, size_t begin, size_t end); + template std::vector subvector(std::vector original, size_t begin, size_t end); + template std::vector subvector(std::vector original, size_t begin, size_t end); + template std::vector subvector(std::vector original, size_t begin, size_t end); + + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + template struct arrN; + + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + template struct arrNRef; + + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + template struct vecArrN; + + // Parton getters + arr4 &parton::momenta() { return this->momenta_; } + const arr4 &parton::momenta() const { return this->momenta_; } + arr4 &parton::momentum() { return this->momenta_; } + const arr4 &parton::momentum() const { return this->momenta_; } + arr4 &parton::pUP() { return this->momenta_; } + const arr4 &parton::pUP() const { return this->momenta_; } + arr4 &parton::p() { 
return this->momenta_; } + const arr4 &parton::p() const { return this->momenta_; } + arr4 &parton::mom() { return this->momenta_; } + const arr4 &parton::mom() const { return this->momenta_; } + double &parton::E() { return this->momenta_[0]; } + const double &parton::E() const { return this->momenta_[0]; } + double &parton::t() { return this->momenta_[0]; } + const double &parton::t() const { return this->momenta_[0]; } + double &parton::px() { return this->momenta_[1]; } + const double &parton::px() const { return this->momenta_[1]; } + double &parton::x() { return this->momenta_[1]; } + const double &parton::x() const { return this->momenta_[1]; } + double &parton::py() { return this->momenta_[2]; } + const double &parton::py() const { return this->momenta_[2]; } + double &parton::y() { return this->momenta_[2]; } + const double &parton::y() const { return this->momenta_[2]; } + double &parton::pz() { return this->momenta_[3]; } + const double &parton::pz() const { return this->momenta_[3]; } + double &parton::z() { return this->momenta_[3]; } + const double &parton::z() const { return this->momenta_[3]; } + double &parton::m() { return this->mass_; } + const double &parton::m() const { return this->mass_; } + double &parton::mass() { return this->mass_; } + const double &parton::mass() const { return this->mass_; } + double &parton::vtim() { return this->vtim_; } + const double &parton::vtim() const { return this->vtim_; } + double &parton::vTimUP() { return this->vtim_; } + const double &parton::vTimUP() const { return this->vtim_; } + double &parton::spin() { return this->spin_; } + const double &parton::spin() const { return this->spin_; } + double &parton::spinUP() { return this->spin_; } + const double &parton::spinUP() const { return this->spin_; } + long int &parton::pdg() { return this->pdg_; } + const long int &parton::pdg() const { return this->pdg_; } + long int &parton::idUP() { return this->pdg_; } + const long int &parton::idUP() const { return 
this->pdg_; } + long int &parton::id() { return this->pdg_; } + const long int &parton::id() const { return this->pdg_; } + short int &parton::status() { return this->status_; } + const short int &parton::status() const { return this->status_; } + short int &parton::iStUP() { return this->status_; } + const short int &parton::iStUP() const { return this->status_; } + short int &parton::iSt() { return this->status_; } + const short int &parton::iSt() const { return this->status_; } + arr2 &parton::mother() { return this->mother_; } + const arr2 &parton::mother() const { return this->mother_; } + arr2 &parton::mothUP() { return this->mother_; } + const arr2 &parton::mothUP() const { return this->mother_; } + arr2 &parton::moth() { return this->mother_; } + const arr2 &parton::moth() const { return this->mother_; } + arr2 &parton::icol() { return this->icol_; } + const arr2 &parton::icol() const { return this->icol_; } + arr2 &parton::iColUP() { return this->icol_; } + const arr2 &parton::iColUP() const { return this->icol_; } + arr2 &parton::iCol() { return this->icol_; } + const arr2 &parton::iCol() const { return this->icol_; } + + parton &parton::set_momenta(const arr4 &mom) + { + this->momenta_ = mom; + return *this; + } + parton &parton::set_mom(const arr4 &mom) { return this->set_momenta(mom); } + parton &parton::set_pUP(const arr4 &mom) { return this->set_momenta(mom); } + parton &parton::set_p(const arr4 &mom) { return this->set_momenta(mom); } + + parton &parton::set_E(double E) + { + this->momenta_[0] = E; + return *this; + } + parton &parton::set_t(double pt) { return this->set_E(pt); } + + parton &parton::set_x(double x) + { + this->momenta_[1] = x; + return *this; + } + parton &parton::set_px(double px) { return this->set_x(px); } + + parton &parton::set_y(double y) + { + this->momenta_[2] = y; + return *this; + } + parton &parton::set_py(double py) { return this->set_y(py); } + + parton &parton::set_z(double z) + { + this->momenta_[3] = z; + return 
*this; + } + parton &parton::set_pz(double pz) { return this->set_z(pz); } + + parton &parton::set_mass(double m) + { + this->mass_ = m; + return *this; + } + + parton &parton::set_vtim(double v) + { + this->vtim_ = v; + return *this; + } + + parton &parton::set_vTimUP(double v) + { + return this->set_vtim(v); + } + + parton &parton::set_spin(double s) + { + this->spin_ = s; + return *this; + } + + parton &parton::set_spinUP(double s) + { + return this->set_spin(s); + } + + parton &parton::set_pdg(long int p) + { + this->pdg_ = p; + return *this; + } + + parton &parton::set_id(long int id) + { + return this->set_pdg(id); + } + + parton &parton::set_idUP(long int id) + { + return this->set_pdg(id); + } + + parton &parton::set_status(short int st) + { + this->status_ = st; + return *this; + } + + parton &parton::set_iStUP(short int st) + { + return this->set_status(st); + } + + parton &parton::set_iSt(short int st) + { + return this->set_status(st); + } + + parton &parton::set_mother(const arr2 &mth) + { + this->mother_ = mth; + return *this; + } + + parton &parton::set_mothUP(const arr2 &mth) + { + return this->set_mother(mth); + } + + parton &parton::set_moth(const arr2 &mth) + { + return this->set_mother(mth); + } + + parton &parton::set_mother(const short int m1, const short int m2) + { + this->mother_[0] = m1; + this->mother_[1] = m2; + return *this; + } + + parton &parton::set_mothUP(const short int m1, const short int m2) + { + return this->set_mother(m1, m2); + } + + parton &parton::set_moth(const short int m1, const short int m2) + { + return this->set_mother(m1, m2); + } + + parton &parton::set_icol(const arr2 &icol) + { + this->icol_ = icol; + return *this; + } + + parton &parton::set_iColUP(const arr2 &icol) + { + return this->set_icol(icol); + } + + parton &parton::set_iCol(const arr2 &col) + { + return this->set_icol(col); + } + + parton &parton::set_icol(const short int i1, const short int i2) + { + this->icol_[0] = i1; + this->icol_[1] = i2; + return 
*this; + } + + parton &parton::set_iColUP(const short int i1, const short int i2) + { + return this->set_icol(i1, i2); + } + + parton &parton::set_iCol(const short int i1, const short int i2) + { + return this->set_icol(i1, i2); + } + + // Physical observables + double parton::pT() const + { + return std::sqrt(this->momenta_[1] * this->momenta_[1] + this->momenta_[2] * this->momenta_[2]); + } + double parton::pT2() const + { + return this->momenta_[1] * this->momenta_[1] + this->momenta_[2] * this->momenta_[2]; + } + double parton::pL() const + { + return this->momenta_[3]; + } + double parton::pL2() const + { + return this->momenta_[3] * this->momenta_[3]; + } + double parton::eT() const + { + return std::sqrt(this->mass_ * this->mass_ + this->pT2()); + } + double parton::eT2() const + { + return this->mass_ * this->mass_ + this->pT2(); + } + double parton::phi() const + { + return std::atan2(this->momenta_[2], this->momenta_[1]); + } + double parton::theta() const + { + double p = std::sqrt(this->momenta_[1] * this->momenta_[1] + this->momenta_[2] * this->momenta_[2] + this->momenta_[3] * this->momenta_[3]); + if (p == 0.0) + return 0.0; + return std::acos(this->momenta_[3] / p); + } + double parton::eta() const + { + double p = std::sqrt(this->momenta_[1] * this->momenta_[1] + this->momenta_[2] * this->momenta_[2] + this->momenta_[3] * this->momenta_[3]); + if (std::abs(p - std::abs(this->momenta_[3])) < 1e-10) + return (this->momenta_[3] >= 0.0) ? INFINITY : -INFINITY; // Infinite pseudorapidity for massless particles along the beamline + + return 0.5 * std::log((p + this->momenta_[3]) / (p - this->momenta_[3])); + } + double parton::rap() const + { + if (std::abs(this->momenta_[0] - std::abs(this->momenta_[3])) < 1e-10) + return (this->momenta_[3] >= 0.0) ? 
INFINITY : -INFINITY; // Infinite rapidity for massless particles along the beamline + + return 0.5 * std::log((this->momenta_[0] + this->momenta_[3]) / (this->momenta_[0] - this->momenta_[3])); + } + double parton::mT() const + { + return std::sqrt(this->mass_ * this->mass_ + this->pT2()); + } + double parton::mT2() const + { + return this->mass_ * this->mass_ + this->pT2(); + } + double parton::m2() const + { + return this->mass_ * this->mass_; + } + + // Print functions + void event::particle::print(std::ostream &os) const + { + os << std::setprecision(10) << std::scientific << std::noshowpos // Use scientific notation with 11 digits printed, don't include leading + + << " " << std::setw(8) << this->pdg_ + << " " << std::setw(2) << this->status_ + << " " << std::setw(4) << this->mother_[0] + << " " << std::setw(4) << this->mother_[1] + << " " << std::setw(4) << this->icol_[0] + << " " << std::setw(4) << this->icol_[1] + << std::showpos // Enable leading + for spatial momenta + << " " << std::setw(16) << this->momentum_[1] // Note: momentum ordering in LHEF is (px,py,pz,E) whereas we store it as (E,px,py,pz) + << " " << std::setw(16) << this->momentum_[2] + << " " << std::setw(16) << this->momentum_[3] + << std::noshowpos // Disable leading + for energy, mass, lifetime, spin (should all be positive) + << " " << std::setw(16) << this->momentum_[0] + << " " << std::setw(16) << this->mass_ + << std::setprecision(4) // Lower precision for lifetime, spin + << " " << std::setw(10) << this->vtim_ + << " " << std::setw(10) << this->spin_ << "\n"; + } + + void event::const_particle::print(std::ostream &os) const + { + os << std::setprecision(10) << std::scientific << std::noshowpos // Use scientific notation with 11 digits printed, don't include leading + + << " " << std::setw(8) << this->pdg_ + << " " << std::setw(2) << this->status_ + << " " << std::setw(4) << this->mother_[0] + << " " << std::setw(4) << this->mother_[1] + << " " << std::setw(4) << this->icol_[0] + << " 
" << std::setw(4) << this->icol_[1] + << std::showpos // Enable leading + for spatial momenta + << " " << std::setw(16) << this->momentum_[1] // Note: momentum ordering in LHEF is (px,py,pz,E) whereas we store it as (E,px,py,pz) + << " " << std::setw(16) << this->momentum_[2] + << " " << std::setw(16) << this->momentum_[3] + << std::noshowpos // Disable leading + for energy, mass, lifetime, spin (should all be positive) + << " " << std::setw(16) << this->momentum_[0] + << " " << std::setw(16) << this->mass_ + << std::setprecision(4) // Lower precision for lifetime, spin + << " " << std::setw(10) << this->vtim_ + << " " << std::setw(10) << this->spin_ << "\n"; + } + + // Construct an event with n_particles uninitialised particle slots (SoA vectors sized up front). + event::event(size_t n_particles) + : momenta_(n_particles), + mass_(n_particles), + vtim_(n_particles), + spin_(n_particles), + pdg_(n_particles), + status_(n_particles), + mother_(n_particles), + icol_(n_particles) + { + this->n_ = n_particles; + } + + // Construct an event from a vector of partons, scattering each parton's AoS fields + // into this event's SoA storage. (Element type restored to parton: the loop reads + // particles[i].momenta_, the member name used by parton — particle/const_particle use momentum_.) + event::event(std::vector<parton> particles) + : momenta_(particles.size()), + mass_(particles.size()), + vtim_(particles.size()), + spin_(particles.size()), + pdg_(particles.size()), + status_(particles.size()), + mother_(particles.size()), + icol_(particles.size()) + { + this->n_ = particles.size(); + for (size_t i = 0; i < this->n_; ++i) + { + momenta_[i] = particles[i].momenta_; + mass_[i] = particles[i].mass_; + vtim_[i] = particles[i].vtim_; + spin_[i] = particles[i].spin_; + pdg_[i] = particles[i].pdg_; + status_[i] = particles[i].status_; + mother_[i] = particles[i].mother_; + icol_[i] = particles[i].icol_; + } + } + + // Print the LHEF <event> header line: n, process id, weight, scale, alphaEW, alphaS. + void event::print_head(std::ostream &os) const + { + os << std::setprecision(7) << std::scientific << std::noshowpos + << std::left // Left-align n + << " " << std::setw(4) << this->n_ + << std::right // Right-align remaining info + << " " << std::setw(3) << this->proc_id_ + << std::showpos // Enable leading + for weight + << " " << std::setw(13) << this->weight_ + << std::noshowpos << std::setprecision(8) // Disable leading + for scale
and couplings + << " " << std::setw(14) << this->scale_ + << " " << std::setw(14) << this->alphaEW_ + << " " << std::setw(14) << this->alphaS_ << "\n"; + } + + // Print the event weights as a bare whitespace-separated list (no <wgt> tags). + void event::print_wgts_no_ids(std::ostream &os) const + { + if (this->wgts_.size() == 0) + return; + os << std::setprecision(7) << std::scientific << std::showpos; + os << " "; + for (const auto &w : this->wgts_) + { + os << w << " "; + } + os << "\n"; + } + + // Print the event weights as an LHEF v3 reweighting block: + // <rwgt> <wgt id='...'> w </wgt> ... </rwgt> + // Pads missing IDs with "rwgt_<i>"; warns (and prints zeros) when there are more IDs than weights. + void event::print_wgts_ids(std::ostream &os) const + { + if (this->weight_ids->size() > this->wgts_.size()) + { + warning("More weight IDs than weights available. Printing zero weights, which may have incorrect indexing."); + // this->wgts_.resize(this->weight_ids->size(), 0.0); + } + if (this->weight_ids->size() < this->wgts_.size()) + { + for (size_t i = this->weight_ids->size(); i < this->wgts_.size(); ++i) + { + this->weight_ids->push_back("rwgt_" + std::to_string(i + 1)); + } + } + if (this->wgts_.size() == 0) + return; + os << std::setprecision(7) << std::scientific << std::showpos; + os << "<rwgt>"; + for (size_t i = 0; i < this->weight_ids->size(); ++i) + { + os << "\n<wgt id=\'" << (*this->weight_ids)[i] << "\'> "; + if (i < this->wgts_.size()) + { + os << this->wgts_[i]; + } + else + { + os << 0.0; + } + os << " </wgt>"; + } + os << "\n</rwgt>\n"; + } + + // Dispatch to the tagged (LHEF v3) or untagged weight printer. + void event::print_wgts(std::ostream &os, bool include_ids) const + { + if (include_ids) + { + print_wgts_ids(os); + } + else + { + print_wgts_no_ids(os); + } + } + + // Strong coupling g_s derived from alphaS: g_s = sqrt(4 pi alphaS). + double event::gS() + { + return std::sqrt(4. * pi * alphaS_); + } + + // Factorisation scale; falls back to the event scale when unset (0.0). + double event::get_muF() const + { + return muF_ != 0.0 ? muF_ : scale_; + } + + // Renormalisation scale; falls back to the event scale when unset (0.0). + double event::get_muR() const + { + return muR_ != 0.0 ? muR_ : scale_; + } + + // Parton-shower scale; falls back to the event scale when unset (0.0). + double event::get_muPS() const + { + return muPS_ != 0.0 ?
muPS_ : scale_; + } + + size_t &event::nUP() { return n_; } + const size_t &event::nUP() const { return n_; } + size_t &event::n() { return n_; } + const size_t &event::n() const { return n_; } + + long int &event::idPrUP() { return proc_id_; } + const long int &event::idPrUP() const { return proc_id_; } + long int &event::idPr() { return proc_id_; } + const long int &event::idPr() const { return proc_id_; } + + double &event::xWgtUP() { return weight_; } + const double &event::xWgtUP() const { return weight_; } + double &event::xWgt() { return weight_; } + const double &event::xWgt() const { return weight_; } + double &event::weight() { return weight_; } + const double &event::weight() const { return weight_; } + + double &event::scalUP() { return scale_; } + const double &event::scalUP() const { return scale_; } + double &event::scale() { return scale_; } + const double &event::scale() const { return scale_; } + double &event::muF() { return muF_; } + const double &event::muF() const { return muF_; } + double &event::muR() { return muR_; } + const double &event::muR() const { return muR_; } + double &event::muPS() { return muPS_; } + const double &event::muPS() const { return muPS_; } + + double &event::aQEDUP() { return alphaEW_; } + const double &event::aQEDUP() const { return alphaEW_; } + double &event::aQED() { return alphaEW_; } + const double &event::aQED() const { return alphaEW_; } + double &event::alphaEW() { return alphaEW_; } + const double &event::alphaEW() const { return alphaEW_; } + double &event::aEW() { return alphaEW_; } + const double &event::aEW() const { return alphaEW_; } + + double &event::aQCDUP() { return alphaS_; } + const double &event::aQCDUP() const { return alphaS_; } + double &event::aQCD() { return alphaS_; } + const double &event::aQCD() const { return alphaS_; } + double &event::alphaS() { return alphaS_; } + const double &event::alphaS() const { return alphaS_; } + double &event::aS() { return alphaS_; } + const double 
&event::aS() const { return alphaS_; } + + vecArr4 &event::pUP() { return momenta_; } + const vecArr4 &event::pUP() const { return momenta_; } + vecArr4 &event::p() { return momenta_; } + const vecArr4 &event::p() const { return momenta_; } + vecArr4 &event::momenta() { return momenta_; } + const vecArr4 &event::momenta() const { return momenta_; } + vecArr4 &event::momentum() { return momenta_; } + const vecArr4 &event::momentum() const { return momenta_; } + + std::vector &event::mUP() { return mass_; } + const std::vector &event::mUP() const { return mass_; } + std::vector &event::m() { return mass_; } + const std::vector &event::m() const { return mass_; } + std::vector &event::mass() { return mass_; } + const std::vector &event::mass() const { return mass_; } + + std::vector &event::vTimUP() { return vtim_; } + const std::vector &event::vTimUP() const { return vtim_; } + std::vector &event::vtim() { return vtim_; } + const std::vector &event::vtim() const { return vtim_; } + std::vector &event::vTim() { return vtim_; } + const std::vector &event::vTim() const { return vtim_; } + + std::vector &event::spinUP() { return spin_; } + const std::vector &event::spinUP() const { return spin_; } + std::vector &event::spin() { return spin_; } + const std::vector &event::spin() const { return spin_; } + + std::vector &event::idUP() { return pdg_; } + const std::vector &event::idUP() const { return pdg_; } + std::vector &event::id() { return pdg_; } + const std::vector &event::id() const { return pdg_; } + std::vector &event::pdg() { return pdg_; } + const std::vector &event::pdg() const { return pdg_; } + + std::vector &event::iStUP() { return status_; } + const std::vector &event::iStUP() const { return status_; } + std::vector &event::iSt() { return status_; } + const std::vector &event::iSt() const { return status_; } + std::vector &event::status() { return status_; } + const std::vector &event::status() const { return status_; } + + vecArr2 &event::mothUP() { return 
mother_; } + const vecArr2 &event::mothUP() const { return mother_; } + vecArr2 &event::moth() { return mother_; } + const vecArr2 &event::moth() const { return mother_; } + vecArr2 &event::mother() { return mother_; } + const vecArr2 &event::mother() const { return mother_; } + + vecArr2 &event::iColUP() { return icol_; } + const vecArr2 &event::iColUP() const { return icol_; } + vecArr2 &event::iCol() { return icol_; } + const vecArr2 &event::iCol() const { return icol_; } + vecArr2 &event::icol() { return icol_; } + const vecArr2 &event::icol() const { return icol_; } + + std::vector &event::wgts() { return wgts_; } + const std::vector &event::wgts() const { return wgts_; } + + size_t event::n_wgts() const { return wgts_.size(); } + + event &event::set_n(size_t n_particles) + { + if (n_particles != this->n_) + { + // Resize all vectors to match the new number of particles + // (this may overwrite existing particles) + momenta_.resize(n_particles); + mass_.resize(n_particles); + vtim_.resize(n_particles); + spin_.resize(n_particles); + pdg_.resize(n_particles); + status_.resize(n_particles); + mother_.resize(n_particles); + icol_.resize(n_particles); + } + this->n_ = n_particles; + return *this; + } + + event &event::set_proc_id(long int proc_id) + { + this->proc_id_ = proc_id; + return *this; + } + + event &event::set_weight(double weight) + { + this->weight_ = weight; + return *this; + } + + event &event::set_scale(double scale) + { + this->scale_ = scale; + this->muF_ = (muF_ == 0.0) ? scale : muF_; // Set muF if not already set + this->muR_ = (muR_ == 0.0) ? scale : muR_; // Set muR if not already set + this->muPS_ = (muPS_ == 0.0) ? 
scale : muPS_; // Set muPS if not already set + return *this; + } + + event &event::set_muF(double muF) + { + this->muF_ = muF; + return *this; + } + + event &event::set_muR(double muR) + { + this->muR_ = muR; + return *this; + } + + event &event::set_muPS(double muPS) + { + this->muPS_ = muPS; + return *this; + } + + event &event::set_alphaEW(double alphaEW) + { + this->alphaEW_ = alphaEW; + return *this; + } + + event &event::set_alphaS(double alphaS) + { + this->alphaS_ = alphaS; + return *this; + } + + event &event::set_momenta(const vecArr4 &mom) + { + this->momenta_ = mom; + return *this; + } + + event &event::set_momenta(const std::vector> &mom) + { + this->momenta_.resize(mom.size()); + for (size_t i = 0; i < mom.size(); ++i) + { + this->momenta_[i] = {mom[i][0], mom[i][1], mom[i][2], mom[i][3]}; + } + return *this; + } + + event &event::set_mass(const std::vector &m) + { + this->mass_ = m; + return *this; + } + + event &event::set_vtim(const std::vector &v) + { + this->vtim_ = v; + return *this; + } + + event &event::set_spin(const std::vector &s) + { + this->spin_ = s; + return *this; + } + + event &event::set_pdg(const std::vector &p) + { + this->pdg_ = p; + return *this; + } + + event &event::set_status(const std::vector &st) + { + this->status_ = st; + return *this; + } + + event &event::set_mother(const vecArr2 &m) + { + this->mother_ = m; + return *this; + } + + event &event::set_mother(const std::vector> &m) + { + this->mother_.resize(m.size()); + for (size_t i = 0; i < m.size(); ++i) + { + this->mother_[i] = {m[i][0], m[i][1]}; + } + return *this; + } + + event &event::set_icol(const vecArr2 &c) + { + this->icol_ = c; + return *this; + } + + event &event::set_icol(const std::vector> &c) + { + this->icol_.resize(c.size()); + for (size_t i = 0; i < c.size(); ++i) + { + this->icol_[i] = {c[i][0], c[i][1]}; + } + return *this; + } + + event &event::set_wgts(const std::vector &w) + { + this->wgts_ = w; + return *this; + } + + event 
&event::add_wgt(double w, const std::string &id) + { + this->wgts_.push_back(w); + if (!id.empty()) + this->weight_ids->push_back(id); + return *this; + } + + event &event::add_particle(const parton &p) + { + momenta_.push_back(p.momenta_); + mass_.push_back(p.mass_); + vtim_.push_back(p.vtim_); + spin_.push_back(p.spin_); + pdg_.push_back(p.pdg_); + status_.push_back(p.status_); + mother_.push_back(p.mother_); + icol_.push_back(p.icol_); + ++n_; + return *this; + } + + event &event::add_particle(const particle &p) + { + momenta_.push_back(p.momentum_); + mass_.push_back(p.mass_); + vtim_.push_back(p.vtim_); + spin_.push_back(p.spin_); + pdg_.push_back(p.pdg_); + status_.push_back(p.status_); + mother_.push_back(p.mother_); + icol_.push_back(p.icol_); + ++n_; + return *this; + } + + event &event::add_particle(const const_particle &p) + { + momenta_.push_back(p.momentum_); + mass_.push_back(p.mass_); + vtim_.push_back(p.vtim_); + spin_.push_back(p.spin_); + pdg_.push_back(p.pdg_); + status_.push_back(p.status_); + mother_.push_back(p.mother_); + icol_.push_back(p.icol_); + ++n_; + return *this; + } + + // Accessors + event::particle event::get_particle(size_t i) + { + return {momenta_.at(i), mass_.at(i), vtim_.at(i), spin_.at(i), + pdg_.at(i), status_.at(i), mother_.at(i), icol_.at(i)}; + } + + event::const_particle event::get_particle(size_t i) const + { + return {momenta_.at(i), mass_.at(i), vtim_.at(i), spin_.at(i), + pdg_.at(i), status_.at(i), mother_.at(i), icol_.at(i)}; + } + + size_t event::size() const + { + validate(); + return momenta_.size(); + } + + // Iterators + event::particle event::particle_iterator::operator*() + { + return evt->get_particle(index); + } + event::particle_iterator &event::particle_iterator::operator++() + { + ++index; + return *this; + } + bool event::particle_iterator::operator!=(const particle_iterator &other) const + { + return index != other.index; + } + + event::const_particle event::const_particle_iterator::operator*() const + 
{ + return evt->get_particle(index); + } + event::const_particle_iterator &event::const_particle_iterator::operator++() + { + ++index; + return *this; + } + bool event::const_particle_iterator::operator!=(const const_particle_iterator &other) const + { + return index != other.index; + } + + event::particle_iterator event::begin() { return {this, 0}; } + event::particle_iterator event::end() { return {this, size()}; } + + event::const_particle_iterator event::begin() const { return {this, 0}; } + event::const_particle_iterator event::end() const { return {this, size()}; } + + // Particle level accessors and setters + arr4Ref event::particle::pUP() { return momentum_; } + arr4Ref event::particle::pUP() const { return arr4Ref{momentum_.p}; } + arr4Ref event::particle::p() { return momentum_; } + arr4Ref event::particle::p() const { return arr4Ref{momentum_.p}; } + arr4Ref event::particle::mom() { return momentum_; } + arr4Ref event::particle::mom() const { return arr4Ref{momentum_.p}; } + arr4Ref event::particle::momentum() { return momentum_; } + arr4Ref event::particle::momentum() const { return arr4Ref{momentum_.p}; } + double &event::particle::E() { return momentum_[0]; } + const double &event::particle::E() const { return momentum_[0]; } + double &event::particle::t() { return momentum_[0]; } + const double &event::particle::t() const { return momentum_[0]; } + double &event::particle::x() { return momentum_[1]; } + double &event::particle::px() { return momentum_[1]; } + const double &event::particle::x() const { return momentum_[1]; } + const double &event::particle::px() const { return momentum_[1]; } + double &event::particle::y() { return momentum_[2]; } + double &event::particle::py() { return momentum_[2]; } + const double &event::particle::y() const { return momentum_[2]; } + const double &event::particle::py() const { return momentum_[2]; } + double &event::particle::z() { return momentum_[3]; } + double &event::particle::pz() { return momentum_[3]; } + 
const double &event::particle::z() const { return momentum_[3]; } + const double &event::particle::pz() const { return momentum_[3]; } + double &event::particle::mUP() { return mass_; } + const double &event::particle::mUP() const { return mass_; } + double &event::particle::m() { return mass_; } + const double &event::particle::m() const { return mass_; } + double &event::particle::mass() { return mass_; } + const double &event::particle::mass() const { return mass_; } + double &event::particle::vTimUP() { return vtim_; } + const double &event::particle::vTimUP() const { return vtim_; } + double &event::particle::vtim() { return vtim_; } + const double &event::particle::vtim() const { return vtim_; } + double &event::particle::vTim() { return vtim_; } + const double &event::particle::vTim() const { return vtim_; } + double &event::particle::spinUP() { return spin_; } + const double &event::particle::spinUP() const { return spin_; } + double &event::particle::spin() { return spin_; } + const double &event::particle::spin() const { return spin_; } + long int &event::particle::idUP() { return pdg_; } + const long int &event::particle::idUP() const { return pdg_; } + long int &event::particle::id() { return pdg_; } + const long int &event::particle::id() const { return pdg_; } + long int &event::particle::pdg() { return pdg_; } + const long int &event::particle::pdg() const { return pdg_; } + short int &event::particle::iStUP() { return status_; } + const short int &event::particle::iStUP() const { return status_; } + short int &event::particle::iSt() { return status_; } + const short int &event::particle::iSt() const { return status_; } + short int &event::particle::status() { return status_; } + const short int &event::particle::status() const { return status_; } + arr2Ref event::particle::mothUP() { return mother_; } + const arr2Ref event::particle::mothUP() const { return mother_; } + arr2Ref event::particle::moth() { return mother_; } + const arr2Ref 
event::particle::moth() const { return mother_; } + arr2Ref event::particle::mother() { return mother_; } + const arr2Ref event::particle::mother() const { return mother_; } + arr2Ref event::particle::iColUP() { return icol_; } + const arr2Ref event::particle::iColUP() const { return icol_; } + arr2Ref event::particle::iCol() { return icol_; } + const arr2Ref event::particle::iCol() const { return icol_; } + arr2Ref event::particle::icol() { return icol_; } + const arr2Ref event::particle::icol() const { return icol_; } + + // Physical observables + double event::particle::pT() const + { + return std::sqrt(this->momentum_[1] * this->momentum_[1] + this->momentum_[2] * this->momentum_[2]); + } + double event::particle::pT2() const + { + return this->momentum_[1] * this->momentum_[1] + this->momentum_[2] * this->momentum_[2]; + } + double event::particle::pL() const + { + return this->momentum_[3]; + } + double event::particle::pL2() const + { + return this->momentum_[3] * this->momentum_[3]; + } + double event::particle::eT() const + { + return std::sqrt(this->mass_ * this->mass_ + this->pT2()); + } + double event::particle::eT2() const + { + return this->mass_ * this->mass_ + this->pT2(); + } + double event::particle::phi() const + { + return std::atan2(this->momentum_[2], this->momentum_[1]); + } + double event::particle::theta() const + { + double p = std::sqrt(this->momentum_[1] * this->momentum_[1] + this->momentum_[2] * this->momentum_[2] + this->momentum_[3] * this->momentum_[3]); + if (p == 0.0) + return 0.0; + return std::acos(this->momentum_[3] / p); + } + double event::particle::eta() const + { + double p = std::sqrt(this->momentum_[1] * this->momentum_[1] + this->momentum_[2] * this->momentum_[2] + this->momentum_[3] * this->momentum_[3]); + if (std::abs(p - std::abs(this->momentum_[3])) < 1e-10) + return (this->momentum_[3] >= 0.0) ? 
INFINITY : -INFINITY; // Infinite pseudorapidity for massless particles along the beamline + + return 0.5 * std::log((p + this->momentum_[3]) / (p - this->momentum_[3])); + } + double event::particle::rap() const + { + if (std::abs(this->momentum_[0] - std::abs(this->momentum_[3])) < 1e-10) + return (this->momentum_[3] >= 0.0) ? INFINITY : -INFINITY; // Infinite rapidity for massless particles along the beamline + + return 0.5 * std::log((this->momentum_[0] + this->momentum_[3]) / (this->momentum_[0] - this->momentum_[3])); + } + double event::particle::mT() const + { + return std::sqrt(this->mass_ * this->mass_ + this->pT2()); + } + double event::particle::mT2() const + { + return this->mass_ * this->mass_ + this->pT2(); + } + double event::particle::m2() const + { + return this->mass_ * this->mass_; + } + + arr4Ref event::const_particle::pUP() const + { + return momentum_; + } + arr4Ref event::const_particle::mom() const + { + return momentum_; + } + + arr4Ref event::const_particle::momentum() const + { + return momentum_; + } + + arr4Ref event::const_particle::p() const + { + return momentum_; + } + + const double &event::const_particle::E() const { return momentum_[0]; } + const double &event::const_particle::t() const { return momentum_[0]; } + const double &event::const_particle::x() const { return momentum_[1]; } + const double &event::const_particle::px() const { return momentum_[1]; } + const double &event::const_particle::y() const { return momentum_[2]; } + const double &event::const_particle::py() const { return momentum_[2]; } + const double &event::const_particle::z() const { return momentum_[3]; } + const double &event::const_particle::pz() const { return momentum_[3]; } + const double &event::const_particle::mUP() const { return mass_; } + const double &event::const_particle::m() const { return mass_; } + const double &event::const_particle::mass() const { return mass_; } + const double &event::const_particle::vTimUP() const { return vtim_; } + 
const double &event::const_particle::vtim() const { return vtim_; } + const double &event::const_particle::vTim() const { return vtim_; } + const double &event::const_particle::spinUP() const { return spin_; } + const double &event::const_particle::spin() const { return spin_; } + const long int &event::const_particle::idUP() const { return pdg_; } + const long int &event::const_particle::id() const { return pdg_; } + const long int &event::const_particle::pdg() const { return pdg_; } + const short int &event::const_particle::iSt() const { return status_; } + const short int &event::const_particle::status() const { return status_; } + const short int &event::const_particle::iStUP() const { return status_; } + arr2Ref event::const_particle::mothUP() const { return mother_; } + arr2Ref event::const_particle::moth() const { return mother_; } + arr2Ref event::const_particle::mother() const { return mother_; } + arr2Ref event::const_particle::iColUP() const { return icol_; } + arr2Ref event::const_particle::iCol() const { return icol_; } + arr2Ref event::const_particle::icol() const { return icol_; } + + // Physical observables + double event::const_particle::pT() const + { + return std::sqrt(this->momentum_[1] * this->momentum_[1] + this->momentum_[2] * this->momentum_[2]); + } + double event::const_particle::pT2() const + { + return this->momentum_[1] * this->momentum_[1] + this->momentum_[2] * this->momentum_[2]; + } + double event::const_particle::pL() const + { + return this->momentum_[3]; + } + double event::const_particle::pL2() const + { + return this->momentum_[3] * this->momentum_[3]; + } + double event::const_particle::eT() const + { + return std::sqrt(this->mass_ * this->mass_ + this->pT2()); + } + double event::const_particle::eT2() const + { + return this->mass_ * this->mass_ + this->pT2(); + } + double event::const_particle::phi() const + { + return std::atan2(this->momentum_[2], this->momentum_[1]); + } + double event::const_particle::theta() const + { + 
double p = std::sqrt(this->momentum_[1] * this->momentum_[1] + this->momentum_[2] * this->momentum_[2] + this->momentum_[3] * this->momentum_[3]); + if (p == 0.0) + return 0.0; + return std::acos(this->momentum_[3] / p); + } + double event::const_particle::eta() const + { + double p = std::sqrt(this->momentum_[1] * this->momentum_[1] + this->momentum_[2] * this->momentum_[2] + this->momentum_[3] * this->momentum_[3]); + if (std::abs(p - std::abs(this->momentum_[3])) < 1e-10) + return (this->momentum_[3] >= 0.0) ? INFINITY : -INFINITY; // Infinite pseudorapidity for massless particles along the beamline + + return 0.5 * std::log((p + this->momentum_[3]) / (p - this->momentum_[3])); + } + double event::const_particle::rap() const + { + if (std::abs(this->momentum_[0] - std::abs(this->momentum_[3])) < 1e-10) + return (this->momentum_[3] >= 0.0) ? INFINITY : -INFINITY; // Infinite rapidity for massless particles along the beamline + + return 0.5 * std::log((this->momentum_[0] + this->momentum_[3]) / (this->momentum_[0] - this->momentum_[3])); + } + double event::const_particle::mT() const + { + return std::sqrt(this->mass_ * this->mass_ + this->pT2()); + } + double event::const_particle::mT2() const + { + return this->mass_ * this->mass_ + this->pT2(); + } + double event::const_particle::m2() const + { + return this->mass_ * this->mass_; + } + + event::particle &event::particle::set_pdg(long int p) + { + this->pdg_ = p; + return *this; + } + event::particle &event::particle::set_id(long int id) { return this->set_pdg(id); } + event::particle &event::particle::set_idUP(long int id) { return this->set_pdg(id); } + + event::particle &event::particle::set_status(short int s) + { + this->status_ = s; + return *this; + } + event::particle &event::particle::set_iSt(short int s) { return this->set_status(s); } + event::particle &event::particle::set_iStUP(short int s) { return this->set_status(s); } + + event::particle &event::particle::set_mother(short int i, short int j) + { 
+ this->mother_ = {i, j}; + return *this; + } + event::particle &event::particle::set_mother(const arr2 &m) + { + this->mother_ = m; + return *this; + } + event::particle &event::particle::set_moth(short int i, short int j) { return this->set_mother(i, j); } + event::particle &event::particle::set_moth(const arr2 &m) { return this->set_mother(m); } + event::particle &event::particle::set_mothUP(short int i, short int j) { return this->set_mother(i, j); } + event::particle &event::particle::set_mothUP(const arr2 &m) { return this->set_mother(m); } + + event::particle &event::particle::set_icol(short int i, short int c) + { + this->icol_ = {i, c}; + return *this; + } + event::particle &event::particle::set_icol(const arr2 &c) + { + this->icol_ = c; + return *this; + } + event::particle &event::particle::set_iColUP(short int i, short int c) { return this->set_icol(i, c); } + event::particle &event::particle::set_iColUP(const arr2 &c) { return this->set_icol(c); } + + event::particle &event::particle::set_momentum(double e, double px, double py, double pz) + { + this->momentum_ = {e, px, py, pz}; + return *this; + } + event::particle &event::particle::set_momentum(const arr4 &mom) + { + this->momentum_ = mom; + return *this; + } + event::particle &event::particle::set_mom(double e, double px, double py, double pz) { return this->set_momentum(e, px, py, pz); } + event::particle &event::particle::set_mom(const arr4 &mom) { return this->set_momentum(mom); } + event::particle &event::particle::set_pUP(double e, double px, double py, double pz) { return this->set_momentum(e, px, py, pz); } + event::particle &event::particle::set_pUP(const arr4 &mom) { return this->set_momentum(mom); } + event::particle &event::particle::set_p(double e, double px, double py, double pz) { return this->set_momentum(e, px, py, pz); } + event::particle &event::particle::set_p(const arr4 &mom) { return this->set_momentum(mom); } + + event::particle &event::particle::set_E(double e) + { + 
this->momentum_[0] = e; + return *this; + } + event::particle &event::particle::set_px(double px) + { + this->momentum_[1] = px; + return *this; + } + event::particle &event::particle::set_py(double py) + { + this->momentum_[2] = py; + return *this; + } + event::particle &event::particle::set_pz(double pz) + { + this->momentum_[3] = pz; + return *this; + } + + event::particle &event::particle::set_x(double x) { return this->set_px(x); } + event::particle &event::particle::set_y(double y) { return this->set_py(y); } + event::particle &event::particle::set_z(double z) { return this->set_pz(z); } + event::particle &event::particle::set_t(double pt) { return this->set_E(pt); } + + event::particle &event::particle::set_mass(double m) + { + this->mass_ = m; + return *this; + } + event::particle &event::particle::set_mUP(double m) { return this->set_mass(m); } + event::particle &event::particle::set_m(double m) { return this->set_mass(m); } + + event::particle &event::particle::set_vtim(double v) + { + this->vtim_ = v; + return *this; + } + event::particle &event::particle::set_vTimUP(double v) { return this->set_vtim(v); } + + event::particle &event::particle::set_spin(double s) + { + this->spin_ = s; + return *this; + } + event::particle &event::particle::set_spinUP(double s) { return this->set_spin(s); } + + event::particle event::operator[](size_t i) + { + if (i >= size()) + throw std::out_of_range("event::operator[] index out of range"); + return get_particle(i); + } + + event::particle event::at(size_t i) + { + if (i >= size()) + throw std::out_of_range("event::at index out of range"); + return get_particle(i); + } + + event::const_particle event::operator[](size_t i) const + { + if (i >= size()) + throw std::out_of_range("event::operator[] index out of range"); + return get_particle(i); + } + + event::const_particle event::at(size_t i) const + { + if (i >= size()) + throw std::out_of_range("event::at index out of range"); + return get_particle(i); + } + + void 
event::validate() const + { + size_t s = this->n_; // number of partons in the event + if (s == 0) + { + // If there are no particles, all vectors should be empty + if (!momenta_.empty() || !mass_.empty() || !vtim_.empty() || + !spin_.empty() || !pdg_.empty() || !status_.empty() || + !mother_.empty() || !icol_.empty()) + { + throw std::runtime_error("event::validate() failed: event has no particles, but vectors are not empty"); + } + return; // Nothing to validate + } + auto check = [s](const auto &vec, const char *name) + { + if (vec.size() != s) + { + std::ostringstream oss; + oss << "event::validate() failed: '" << name + << "' has size " << vec.size() << ", expected " << s; + throw std::runtime_error(oss.str()); + } + }; + + check(momenta_, "momenta"); + check(mass_, "mass"); + check(vtim_, "vtim"); + check(spin_, "spin"); + check(pdg_, "pdg"); + check(status_, "status"); + check(mother_, "mother"); + check(icol_, "icol"); + } + + // Protected shared comparator function + namespace + { + std::shared_ptr event_equal_ptr = + std::make_shared(external_legs_const_comparator); + std::mutex event_equal_mutex; + } + + bool extra_fields_equal(const std::unordered_map &a, + const std::unordered_map &b) + { + if (a.size() != b.size()) + return false; + + for (const auto &[key, val_a] : a) + { + auto it = b.find(key); + if (it == b.end()) + return false; + + const std::any &val_b = it->second; + + if (val_a.type() != val_b.type()) + return false; + + // Only compare known types + if (val_a.type() == typeid(int)) + { + if (std::any_cast(val_a) != std::any_cast(val_b)) + return false; + } + else if (val_a.type() == typeid(double)) + { + if (std::any_cast(val_a) != std::any_cast(val_b)) + return false; + } + else if (val_a.type() == typeid(std::string)) + { + if (std::any_cast(val_a) != std::any_cast(val_b)) + return false; + } + else + { + warning("Unknown type for key: " + key + ", skipping (assuming equality)"); + return true; // Skip unknown types + } + } + + return 
true; + } + + event &event::set_indices() + { + if (this->indices.empty()) + { + this->indices.resize(this->size()); + std::iota(this->indices.begin(), this->indices.end(), 0); + } + return *this; + } + + // Helper: key for (status, pdg) + struct StatusPdgKey + { + short st; + long pdg; + bool operator==(const StatusPdgKey &o) const noexcept + { + return st == o.st && pdg == o.pdg; + } + }; + + struct StatusPdgKeyHash + { + size_t operator()(const StatusPdgKey &k) const noexcept + { + // Cheap hash combine + // Cast to unsigned to avoid UB on shifts of negative values. + const auto a = static_cast(static_cast(k.st)); + const auto b = static_cast(static_cast(k.pdg)); + // 16 bits for status, rest for pdg; then mix + uint64_t h = (a << 48) ^ (b + 0x9e3779b97f4a7c15ULL + (a << 6) + (a >> 2)); + return static_cast(h); + } + }; + + event &event::set_indices(const event &other_event, bool fail_on_mismatch) + { + this->validate(); + other_event.validate(); + + const size_t n_this = this->size(); + const size_t n_other = other_event.size(); + if (fail_on_mismatch && n_this != n_other) + { + throw std::runtime_error("event::set_indices: mismatched sizes"); + } + + // (status, pdg) -> queue of positions in other_event (in other’s order) + std::unordered_map, StatusPdgKeyHash> pos_by_key; + pos_by_key.reserve(n_other * 2); + for (size_t j = 0; j < n_other; ++j) + { + pos_by_key[StatusPdgKey{other_event.status_.at(j), other_event.pdg_.at(j)}].push(j); + } + + // Fill indices so that indices[other_pos] = this_pos + this->indices.assign(n_this, npos); + for (size_t i = 0; i < n_this; ++i) + { + StatusPdgKey k{this->status_.at(i), this->pdg_.at(i)}; + auto it = pos_by_key.find(k); + if (it == pos_by_key.end() || it->second.empty()) + { + if (fail_on_mismatch) + throw std::runtime_error("event::set_indices: no remaining match for key"); + continue; + } + const size_t j = it->second.front(); + it->second.pop(); + this->indices[j] = i; // <-- the important line + } + // Optional: 
ensure none left unmatched and no npos remain + if (fail_on_mismatch) + { + for (const auto &kv : pos_by_key) + { + if (!kv.second.empty()) + { + throw std::runtime_error("event::set_indices: surplus particles in other_event"); + } + } + } + + std::vector override_indices; + for (size_t k = 0; k < this->indices.size(); ++k) + { + if (fail_on_mismatch && this->indices[k] == npos) + { + throw std::runtime_error("event::set_indices: internal error, unfilled index"); + } + if (this->indices[k] != npos) + { + override_indices.push_back(this->indices[k]); + } + } + this->set_indices(override_indices); + return *this; + } + + event &event::set_indices(const std::vector &idxs) + { + if (idxs.size() > this->size()) + { + throw std::runtime_error("event::set_indices: size mismatch"); + } + this->indices = idxs; + return *this; + } + + void event::print_extra(std::ostream &os) const + { + for (const auto &[key, val] : this->extra) + { + if (val.type() == typeid(int)) + { + os << "<" << key << "\">" << std::any_cast(val) << "\n"; + } + else if (val.type() == typeid(double)) + { + os << std::setprecision(10) << std::scientific; + os << "<" << key << "\">" << std::any_cast(val) << "\n"; + } + else if (val.type() == typeid(std::string)) + { + os << "<" << key << "\">" << std::any_cast(val) << "\n"; + } + else if (val.type() == typeid(std::string_view)) + { + os << "<" << key << "\">" << std::any_cast(val) << "\n"; + } + else if (val.type() == typeid(xmlNode)) + { + std::any_cast(val).write(os); + os << "\n"; + } + else if (val.type() == typeid(std::shared_ptr)) + { + std::any_cast>(val)->write(os); + os << "\n"; + } + else + { + warning("Unknown type for extra field: " + key + ", skipping print"); + } + } + } + + void event::print_scales(std::ostream &os) const + { + bool scales = false; + std::string scale_str = "muF_ != 0.0 && (this->scale_ - this->muF_) / this->scale_ > 1e-6) + { + scales = true; + scale_str += " muf=\'" + std::to_string(this->muF_) + "\'"; + } + if 
(this->muR_ != 0.0 && (this->scale_ - this->muR_) / this->scale_ > 1e-6) + { + scales = true; + scale_str += " mur=\'" + std::to_string(this->muR_) + "\'"; + } + if (this->muPS_ != 0.0 && (this->scale_ - this->muPS_) / this->scale_ > 1e-6) + { + scales = true; + scale_str += " muPS=\'" + std::to_string(this->muPS_) + "\'"; + } + if (scales) + { + scale_str += ">\n"; + os << scale_str; + } + } + + void event::print(std::ostream &os, bool include_ids) const + { + os << "\n\n"; + this->print_head(os); + for (auto prt : *this) + { + prt.print(os); + } + this->print_scales(os); + this->print_extra(os); + this->print_wgts(os, include_ids); + os << ""; + } + + bool default_event_equal(const event &lhs, const event &rhs) + { + return lhs.momenta_ == rhs.momenta_ && + lhs.mass_ == rhs.mass_ && + lhs.vtim_ == rhs.vtim_ && + lhs.spin_ == rhs.spin_ && + lhs.pdg_ == rhs.pdg_ && + lhs.status_ == rhs.status_ && + lhs.mother_ == rhs.mother_ && + lhs.icol_ == rhs.icol_ && + lhs.n_ == rhs.n_ && + lhs.proc_id_ == rhs.proc_id_ && + lhs.weight_ == rhs.weight_ && + lhs.scale_ == rhs.scale_ && + lhs.alphaEW_ == rhs.alphaEW_ && + lhs.alphaS_ == rhs.alphaS_ && + extra_fields_equal(lhs.extra, rhs.extra); + } + + bool operator==(const event &lhs, const event &rhs) + { + std::shared_ptr fn; + { + std::lock_guard lock(event_equal_mutex); + fn = event_equal_ptr; + } + return (*fn)(lhs, rhs); + } + + bool operator!=(const event &lhs, const event &rhs) + { + return !(lhs == rhs); + } + + void set_event_comparator(cevent_equal_fn fn) + { + std::lock_guard lock(event_equal_mutex); + event_equal_ptr = std::make_shared(std::move(fn)); + } + + void reset_event_comparator() + { + set_event_comparator(external_legs_const_comparator); + } + + bool external_legs_comparator(event &a, event &b) + { + auto count_pdgs = [](const event &e, int status_filter) + { + std::unordered_map pdg_counts; + for (size_t i = 0; i < e.size(); ++i) + { + if (e.status_[i] == status_filter) + { + ++pdg_counts[e.pdg_[i]]; + } + 
} + return pdg_counts; + }; + + // Count initial and final particles separately + auto init_a = count_pdgs(a, -1); + auto init_b = count_pdgs(b, -1); + auto final_a = count_pdgs(a, +1); + auto final_b = count_pdgs(b, +1); + + return init_a == init_b && final_a == final_b; + } + + bool external_legs_const_comparator(const event &a, const event &b) + { + auto count_pdgs = [](const event &e, int status_filter) + { + std::unordered_map pdg_counts; + for (size_t i = 0; i < e.size(); ++i) + { + if (e.status_[i] == status_filter) + { + ++pdg_counts[e.pdg_[i]]; + } + } + return pdg_counts; + }; + + // Count initial and final particles separately + auto init_a = count_pdgs(a, -1); + auto init_b = count_pdgs(b, -1); + auto final_a = count_pdgs(a, +1); + auto final_b = count_pdgs(b, +1); + + return init_a == init_b && final_a == final_b; + } + + bool always_true(const event &a, const event &b) + { + UNUSED(a); + UNUSED(b); + return true; + } + + struct particleKey + { + double mass = 0.0, vtim = 0.0, spin = 0.0; + arr4 momentum{}; + long int pdg = 0; + short int status = 0; + arr2 mother{}, icol{}; + + bool operator==(const particleKey &other) const + { + return mass == other.mass && vtim == other.vtim && spin == other.spin && + momentum == other.momentum && pdg == other.pdg && status == other.status && + mother == other.mother && icol == other.icol; + } + + bool operator<(const particleKey &other) const + { + return std::tie(mass, vtim, spin, momentum, pdg, status, mother, icol) < + std::tie(other.mass, other.vtim, other.spin, other.momentum, + other.pdg, other.status, other.mother, other.icol); + } + std::tuple< + std::optional, + std::optional, + std::optional, + std::array, 4>, + std::optional, + std::optional, + std::optional>, + std::optional>> + sort_key(const eventComparatorConfig &cfg) const + { + return { + cfg.compare_mass ? std::make_optional(mass) : std::nullopt, + cfg.compare_vtim ? std::make_optional(vtim) : std::nullopt, + cfg.compare_spin ? 
std::make_optional(spin) : std::nullopt, + {cfg.compare_momentum_E ? std::make_optional(momentum[0]) : std::nullopt, + cfg.compare_momentum_x ? std::make_optional(momentum[1]) : std::nullopt, + cfg.compare_momentum_y ? std::make_optional(momentum[2]) : std::nullopt, + cfg.compare_momentum_z ? std::make_optional(momentum[3]) : std::nullopt}, + cfg.compare_pdg ? std::make_optional(pdg) : std::nullopt, + cfg.compare_status ? std::make_optional(status) : std::nullopt, + cfg.compare_mother ? std::make_optional(mother) : std::nullopt, + cfg.compare_icol ? std::make_optional(icol) : std::nullopt}; + } + }; + + inline bool nearly_equal_rel(double a, double b, double tol) + { + if (a == b) + return true; + double denom = std::abs(a) + std::abs(b); + return denom == 0.0 ? false : std::abs(a - b) / denom < tol; + } + + event_equal_fn eventComparatorConfig::make_comparator() const + { + return [*this](const event &a, const event &b) -> bool + { + auto extract_keys = [&](const event &ev) + { + std::vector keys; + for (size_t i = 0; i < ev.size(); ++i) + { + if (!status_filter.empty() && !status_filter.count(ev.status_[i])) + continue; + + particleKey key; + if (compare_mass) + key.mass = ev.mass_[i]; + if (compare_vtim) + key.vtim = ev.vtim_[i]; + if (compare_spin) + key.spin = ev.spin_[i]; + + if (compare_momentum) + { + key.momentum = { + compare_momentum_E ? ev.momenta_[i][0] : 0.0, + compare_momentum_x ? ev.momenta_[i][1] : 0.0, + compare_momentum_y ? ev.momenta_[i][2] : 0.0, + compare_momentum_z ? 
ev.momenta_[i][3] : 0.0}; + } + + if (compare_pdg) + key.pdg = ev.pdg_[i]; + if (compare_status) + key.status = ev.status_[i]; + if (compare_mother) + key.mother = ev.mother_[i]; + if (compare_icol) + key.icol = ev.icol_[i]; + + keys.push_back(std::move(key)); + } + std::sort(keys.begin(), keys.end(), [&](const particleKey &a, const particleKey &b) + { return a.sort_key(*this) < b.sort_key(*this); }); + return keys; + }; + + bool local_compare_n = compare_n; + if (!status_filter.empty() && compare_n) + { + warning("compare_n=true with status_filter active — ignoring compare_n."); + // Force-disable to prevent false mismatches + local_compare_n = false; + } + + auto a_keys = extract_keys(a); + auto b_keys = extract_keys(b); + if (a_keys.size() != b_keys.size()) + return false; + + for (size_t i = 0; i < a_keys.size(); ++i) + { + const auto &ka = a_keys[i]; + const auto &kb = b_keys[i]; + if (compare_mass && !nearly_equal_rel(ka.mass, kb.mass, mass_tol)) + return false; + if (compare_vtim && !nearly_equal_rel(ka.vtim, kb.vtim, vtim_tol)) + return false; + if (compare_spin && !nearly_equal_rel(ka.spin, kb.spin, spin_tol)) + return false; + if (compare_momentum) + { + if (compare_momentum_E && !nearly_equal_rel(ka.momentum[0], kb.momentum[0], momentum_tol)) + return false; + if (compare_momentum_x && !nearly_equal_rel(ka.momentum[1], kb.momentum[1], momentum_tol)) + return false; + if (compare_momentum_y && !nearly_equal_rel(ka.momentum[2], kb.momentum[2], momentum_tol)) + return false; + if (compare_momentum_z && !nearly_equal_rel(ka.momentum[3], kb.momentum[3], momentum_tol)) + return false; + } + + if (compare_pdg && ka.pdg != kb.pdg) + return false; + if (compare_status && ka.status != kb.status) + return false; + if (compare_mother && ka.mother != kb.mother) + return false; + if (compare_icol && ka.icol != kb.icol) + return false; + } + if (local_compare_n && a.n_ != b.n_) + return false; + if (compare_proc_id && a.proc_id_ != b.proc_id_) + return false; + if 
(compare_weight && !nearly_equal_rel(a.weight_, b.weight_, weight_tol)) + return false; + if (compare_scale && !nearly_equal_rel(a.scale_, b.scale_, scale_tol)) + return false; + if (compare_alphaEW && !nearly_equal_rel(a.alphaEW_, b.alphaEW_, alphaEW_tol)) + return false; + if (compare_alphaS && !nearly_equal_rel(a.alphaS_, b.alphaS_, alphaS_tol)) + return false; + return true; + }; + } + + cevent_equal_fn eventComparatorConfig::make_const_comparator() const + { + return [*this](const event &a, const event &b) -> bool + { + auto extract_keys = [&](const event &ev) + { + std::vector keys; + for (size_t i = 0; i < ev.size(); ++i) + { + if (!status_filter.empty() && !status_filter.count(ev.status_[i])) + continue; + + particleKey key; + if (compare_mass) + key.mass = ev.mass_[i]; + if (compare_vtim) + key.vtim = ev.vtim_[i]; + if (compare_spin) + key.spin = ev.spin_[i]; + + if (compare_momentum) + { + key.momentum = { + compare_momentum_E ? ev.momenta_[i][0] : 0.0, + compare_momentum_x ? ev.momenta_[i][1] : 0.0, + compare_momentum_y ? ev.momenta_[i][2] : 0.0, + compare_momentum_z ? 
ev.momenta_[i][3] : 0.0}; + } + + if (compare_pdg) + key.pdg = ev.pdg_[i]; + if (compare_status) + key.status = ev.status_[i]; + if (compare_mother) + key.mother = ev.mother_[i]; + if (compare_icol) + key.icol = ev.icol_[i]; + + keys.push_back(std::move(key)); + } + std::sort(keys.begin(), keys.end(), [&](const particleKey &a, const particleKey &b) + { return a.sort_key(*this) < b.sort_key(*this); }); + return keys; + }; + + bool local_compare_n = compare_n; + if (!status_filter.empty() && compare_n) + { + warning("compare_n=true with status_filter active — ignoring compare_n."); + // Force-disable to prevent false mismatches + local_compare_n = false; + } + + auto a_keys = extract_keys(a); + auto b_keys = extract_keys(b); + if (a_keys.size() != b_keys.size()) + return false; + + for (size_t i = 0; i < a_keys.size(); ++i) + { + const auto &ka = a_keys[i]; + const auto &kb = b_keys[i]; + if (compare_mass && !nearly_equal_rel(ka.mass, kb.mass, mass_tol)) + return false; + if (compare_vtim && !nearly_equal_rel(ka.vtim, kb.vtim, vtim_tol)) + return false; + if (compare_spin && !nearly_equal_rel(ka.spin, kb.spin, spin_tol)) + return false; + if (compare_momentum) + { + if (compare_momentum_E && !nearly_equal_rel(ka.momentum[0], kb.momentum[0], momentum_tol)) + return false; + if (compare_momentum_x && !nearly_equal_rel(ka.momentum[1], kb.momentum[1], momentum_tol)) + return false; + if (compare_momentum_y && !nearly_equal_rel(ka.momentum[2], kb.momentum[2], momentum_tol)) + return false; + if (compare_momentum_z && !nearly_equal_rel(ka.momentum[3], kb.momentum[3], momentum_tol)) + return false; + } + + if (compare_pdg && ka.pdg != kb.pdg) + return false; + if (compare_status && ka.status != kb.status) + return false; + if (compare_mother && ka.mother != kb.mother) + return false; + if (compare_icol && ka.icol != kb.icol) + return false; + } + if (local_compare_n && a.n_ != b.n_) + return false; + if (compare_proc_id && a.proc_id_ != b.proc_id_) + return false; + if 
(compare_weight && !nearly_equal_rel(a.weight_, b.weight_, weight_tol)) + return false; + if (compare_scale && !nearly_equal_rel(a.scale_, b.scale_, scale_tol)) + return false; + if (compare_alphaEW && !nearly_equal_rel(a.alphaEW_, b.alphaEW_, alphaEW_tol)) + return false; + if (compare_alphaS && !nearly_equal_rel(a.alphaS_, b.alphaS_, alphaS_tol)) + return false; + return true; + }; + } + + eventComparatorConfig compare_legs_only() + { + return eventComparatorConfig{} + .set_mass(false) + .set_pdg(true) + .set_status(false) + .set_spin(false) + .set_vtim(false) + .set_momentum(false) + .set_status_filter(std::vector{+1, -1}) + .set_n(false) + .set_proc_id(false) + .set_weight(false) + .set_scale(false) + .set_alphaEW(false) + .set_alphaS(false) + .set_mother(false) + .set_icol(false); + } + + eventComparatorConfig compare_final_state_only() + { + return eventComparatorConfig{} + .set_mass(true) + .set_pdg(true) + .set_momentum(true) + .set_status_filter(+1) + .set_vtim(true) + .set_spin(true) + .set_mother(true) + .set_icol(true); + } + + eventComparatorConfig compare_physics_fields() + { + return eventComparatorConfig{} + .set_mass(true) + .set_pdg(true) + .set_status(true) + .set_spin(true) + .set_vtim(true) + .set_momentum(true) + .set_n(true); + } + + eventBelongs::eventBelongs(const event &e) + { + this->events.push_back(std::make_shared(e)); + this->comparator = external_legs_comparator; + this->const_comparator = external_legs_const_comparator; + } + + eventBelongs::eventBelongs(std::shared_ptr e) + { + this->events.push_back(e); + this->comparator = external_legs_comparator; + this->const_comparator = external_legs_const_comparator; + } + + eventBelongs::eventBelongs(std::vector evs) + { + this->events = {}; + for (const auto &e : evs) + { + this->events.push_back(std::make_shared(e)); + } + this->comparator = external_legs_comparator; + this->const_comparator = external_legs_const_comparator; + } + + eventBelongs::eventBelongs(std::vector> evs) + { + 
this->events = evs; + this->comparator = external_legs_comparator; + this->const_comparator = external_legs_const_comparator; + } + + eventBelongs::eventBelongs(const event &e, event_equal_fn comp) + { + this->events.push_back(std::make_shared(e)); + this->comparator = std::move(comp); + } + + eventBelongs::eventBelongs(std::shared_ptr e, event_equal_fn comp) + { + this->events.push_back(e); + this->comparator = std::move(comp); + } + + eventBelongs::eventBelongs(std::vector evs, event_equal_fn comp) + { + this->events = {}; + for (const auto &e : evs) + { + this->events.push_back(std::make_shared(e)); + } + this->comparator = std::move(comp); + } + + eventBelongs::eventBelongs(std::vector> evs, event_equal_fn comp) + { + this->events = evs; + this->comparator = std::move(comp); + } + + eventBelongs::eventBelongs(const event &e, cevent_equal_fn comp) + { + this->events.push_back(std::make_shared(e)); + this->comparator = comp; + this->const_comparator = std::move(comp); + } + + eventBelongs::eventBelongs(std::shared_ptr e, cevent_equal_fn comp) + { + this->events.push_back(e); + this->comparator = comp; + this->const_comparator = std::move(comp); + } + + eventBelongs::eventBelongs(std::vector evs, cevent_equal_fn comp) + { + this->events = {}; + for (const auto &e : evs) + { + this->events.push_back(std::make_shared(e)); + } + this->comparator = comp; + this->const_comparator = std::move(comp); + } + + eventBelongs::eventBelongs(std::vector> evs, cevent_equal_fn comp) + { + this->events = evs; + this->comparator = comp; + this->const_comparator = std::move(comp); + } + + eventBelongs::eventBelongs(const event &e, event_equal_fn comp, cevent_equal_fn ccomp) + { + this->events.push_back(std::make_shared(e)); + this->comparator = std::move(comp); + this->const_comparator = std::move(ccomp); + } + + eventBelongs::eventBelongs(std::shared_ptr e, event_equal_fn comp, cevent_equal_fn ccomp) + { + this->events.push_back(e); + this->comparator = std::move(comp); + 
this->const_comparator = std::move(ccomp); + } + + eventBelongs::eventBelongs(std::vector evs, event_equal_fn comp, cevent_equal_fn ccomp) + { + this->events = {}; + for (const auto &e : evs) + { + this->events.push_back(std::make_shared(e)); + } + this->comparator = std::move(comp); + this->const_comparator = std::move(ccomp); + } + + eventBelongs::eventBelongs(std::vector> evs, event_equal_fn comp, cevent_equal_fn ccomp) + { + this->events = evs; + this->comparator = std::move(comp); + this->const_comparator = std::move(ccomp); + } + + eventBelongs &eventBelongs::add_event(const event &e) + { + this->events.push_back(std::make_shared(e)); + return *this; + } + + eventBelongs &eventBelongs::add_event(std::shared_ptr e) + { + this->events.push_back(e); + return *this; + } + + eventBelongs &eventBelongs::add_event(const std::vector &evs) + { + for (const auto &e : evs) + { + this->events.push_back(std::make_shared(e)); + } + return *this; + } + + eventBelongs &eventBelongs::add_event(std::vector> evs) + { + for (const auto &e : evs) + { + this->events.push_back(e); + } + return *this; + } + + eventBelongs &eventBelongs::set_events(const event &e) + { + this->events.clear(); + this->events.push_back(std::make_shared(e)); + return *this; + } + + eventBelongs &eventBelongs::set_events(std::shared_ptr e) + { + this->events.clear(); + this->events.push_back(e); + return *this; + } + + eventBelongs &eventBelongs::set_events(const std::vector &evs) + { + this->events = {}; + for (const auto &e : evs) + { + this->events.push_back(std::make_shared(e)); + } + return *this; + } + + eventBelongs &eventBelongs::set_events(std::vector> evs) + { + this->events = evs; + return *this; + } + + eventBelongs &eventBelongs::set_comparator(event_equal_fn comp) + { + this->comparator = std::move(comp); + return *this; + } + + eventBelongs &eventBelongs::set_comparator(cevent_equal_fn comp) + { + this->comparator = comp; + this->const_comparator = std::move(comp); + return *this; + } + + 
eventBelongs &eventBelongs::set_comparator(const eventComparatorConfig &cfg) + { + this->comparator = cfg.make_comparator(); + this->const_comparator = cfg.make_const_comparator(); + return *this; + } + + // If non-const and an event belongs, set its indices automatically + bool eventBelongs::belongs_mutable(event &e) + { + if (this->events.empty() && !(this->comparator || this->const_comparator)) + { + throw std::runtime_error("eventBelongs::belongs() called with no events set"); + } + if (!this->comparator) + { + if (!this->const_comparator) + throw std::runtime_error("eventBelongs::belongs() called with no comparator set"); + // Fallback to const comparator if available + return this->belongs_const(static_cast(e)); + } + for (auto &ev : this->events) + { + if (this->comparator(*ev, e)) + { + e.set_indices(*ev); + return true; + } + } + return false; + } + + bool eventBelongs::belongs_const(const event &e) const + { + if (this->events.empty()) + { + throw std::runtime_error("eventBelongs::belongs() called with no events set"); + } + if (!this->const_comparator) + { + throw std::runtime_error("eventBelongs::belongs() called with no const_comparator set"); + } + for (const auto &ev : this->events) + { + if (this->const_comparator(*ev, e)) + { + return true; + } + } + return false; + } + + bool eventBelongs::belongs(event &e) + { + return this->belongs_mutable(e); + } + + bool eventBelongs::belongs(const event &e) const + { + return this->belongs_const(e); + } + + bool eventBelongs::belongs(std::shared_ptr e) + { + return this->belongs_mutable(*e); + } + + event_bool_fn eventBelongs::get_event_bool() + { + return [this](event &e) -> bool + { + return this->belongs_mutable(e); + }; + } + + cevent_bool_fn eventBelongs::get_const_event_bool() const + { + return [this](const event &e) -> bool + { + return this->belongs_const(e); + }; + } + + eventBelongs all_events_belong() + { + event e1(0); + return eventBelongs(e1, always_true, always_true); + } + + 
// eventSorter: an ordered list of eventBelongs sets (or raw predicates) used
// to classify events. event_sets, comparators and const_comparators are kept
// index-aligned when built from eventBelongs objects.
eventSorter::eventSorter(const eventBelongs &e_set)
{
    this->event_sets.push_back(std::make_shared<eventBelongs>(e_set));
    this->comparators.push_back(this->event_sets.back()->get_event_bool());
    this->const_comparators.push_back(this->event_sets.back()->get_const_event_bool());
}

eventSorter::eventSorter(event_bool_fn comp)
{
    this->comparators.push_back(std::move(comp));
}

eventSorter::eventSorter(cevent_bool_fn comp)
{
    // A const predicate also serves as the mutable one via a wrapper.
    this->comparators.push_back([comp](event &e)
                                { return comp(e); });
    this->const_comparators.push_back(std::move(comp));
}

eventSorter::eventSorter(event_bool_fn comp, cevent_bool_fn ccomp)
{
    this->comparators.push_back(std::move(comp));
    this->const_comparators.push_back(std::move(ccomp));
}

eventSorter::eventSorter(std::vector<eventBelongs> e_sets)
{
    this->event_sets = {};
    for (const auto &es : e_sets)
    {
        this->event_sets.push_back(std::make_shared<eventBelongs>(es));
        this->comparators.push_back(this->event_sets.back()->get_event_bool());
        this->const_comparators.push_back(this->event_sets.back()->get_const_event_bool());
    }
}

eventSorter::eventSorter(std::vector<event_bool_fn> comps)
{
    this->comparators = std::move(comps);
}

eventSorter::eventSorter(std::vector<event_bool_fn> comps, std::vector<cevent_bool_fn> ccomps)
{
    if (comps.size() != ccomps.size())
    {
        throw std::runtime_error("eventSorter: size mismatch in constructor");
    }
    this->comparators = std::move(comps);
    this->const_comparators = std::move(ccomps);
}

eventSorter &eventSorter::add_event_set(const eventBelongs &e_set)
{
    this->event_sets.push_back(std::make_shared<eventBelongs>(e_set));
    this->comparators.push_back(this->event_sets.back()->get_event_bool());
    this->const_comparators.push_back(this->event_sets.back()->get_const_event_bool());
    return *this;
}

eventSorter &eventSorter::add_event_set(const std::vector<eventBelongs> &e_sets)
{
    size_t old_size = this->event_sets.size();
    for (const auto &es : e_sets)
    {
        this->event_sets.push_back(std::make_shared<eventBelongs>(es));
    }
    this->comparators.reserve(this->event_sets.size());
    this->const_comparators.reserve(this->event_sets.size());
    for (size_t i = old_size; i < this->event_sets.size(); ++i)
    {
        auto &es = this->event_sets[i];
        this->comparators.push_back(es->get_event_bool());
        this->const_comparators.push_back(es->get_const_event_bool());
    }
    return *this;
}

eventSorter &eventSorter::add_bool(event_bool_fn comp)
{
    this->comparators.push_back(std::move(comp));
    return *this;
}

eventSorter &eventSorter::add_const_bool(cevent_bool_fn comp)
{
    this->comparators.push_back([comp](event &e)
                                { return comp(e); });
    this->const_comparators.push_back(std::move(comp));
    return *this;
}

eventSorter &eventSorter::add_bool(event_bool_fn comp, cevent_bool_fn ccomp)
{
    this->comparators.push_back(std::move(comp));
    this->const_comparators.push_back(std::move(ccomp));
    return *this;
}

eventSorter &eventSorter::add_bool(std::vector<event_bool_fn> comps)
{
    this->comparators.insert(this->comparators.end(), std::make_move_iterator(comps.begin()), std::make_move_iterator(comps.end()));
    return *this;
}

eventSorter &eventSorter::add_const_bool(std::vector<cevent_bool_fn> ccomps)
{
    // Wrap copies for the mutable slots before moving into the const slots.
    for (auto &c : ccomps)
    {
        this->comparators.push_back([c](event &e)
                                    { return c(e); });
    }
    this->const_comparators.insert(this->const_comparators.end(), std::make_move_iterator(ccomps.begin()), std::make_move_iterator(ccomps.end()));
    return *this;
}

eventSorter &eventSorter::add_bool(std::vector<event_bool_fn> comps, std::vector<cevent_bool_fn> ccomps)
{
    if (comps.size() != ccomps.size())
    {
        throw std::runtime_error("eventSorter::add_bool: size mismatch");
    }
    this->comparators.insert(this->comparators.end(), std::make_move_iterator(comps.begin()), std::make_move_iterator(comps.end()));
    this->const_comparators.insert(this->const_comparators.end(), std::make_move_iterator(ccomps.begin()), std::make_move_iterator(ccomps.end()));
    return *this;
}

// Replace the whole classification set with a single eventBelongs.
eventSorter &eventSorter::set_event_sets(const eventBelongs &e_set)
{
    this->event_sets.clear();
    this->event_sets.push_back(std::make_shared<eventBelongs>(e_set));
    this->comparators.clear();
    this->const_comparators.clear();
    this->comparators.push_back(this->event_sets.back()->get_event_bool());
    this->const_comparators.push_back(this->event_sets.back()->get_const_event_bool());
    return *this;
}

eventSorter &eventSorter::set_event_sets(const std::vector<eventBelongs> &e_sets)
{
    this->event_sets = {};
    this->comparators.clear();
    this->const_comparators.clear();
    this->event_sets.reserve(e_sets.size());
    for (const auto &es : e_sets)
    {
        this->event_sets.push_back(std::make_shared<eventBelongs>(es));
        this->comparators.push_back(this->event_sets.back()->get_event_bool());
        this->const_comparators.push_back(this->event_sets.back()->get_const_event_bool());
    }
    return *this;
}

// Replace the predicate lists directly (drops any event_sets).
eventSorter &eventSorter::set_bools(event_bool_fn comp)
{
    this->event_sets.clear();
    this->comparators = {std::move(comp)};
    this->const_comparators.clear();
    return *this;
}

eventSorter &eventSorter::set_const_bools(cevent_bool_fn comp)
{
    this->event_sets.clear();
    this->comparators = {[comp](event &e)
                         { return comp(e); }};
    this->const_comparators = {std::move(comp)};
    return *this;
}

eventSorter &eventSorter::set_bools(event_bool_fn comp, cevent_bool_fn ccomp)
{
    this->event_sets.clear();
    this->comparators = {std::move(comp)};
    this->const_comparators = {std::move(ccomp)};
    return *this;
}

eventSorter &eventSorter::set_bools(std::vector<event_bool_fn> comps)
{
    this->event_sets.clear();
    this->comparators = std::move(comps);
    this->const_comparators.clear();
    return *this;
}

eventSorter &eventSorter::set_const_bools(std::vector<cevent_bool_fn> ccomps)
{
    this->event_sets.clear();
    this->comparators = {};
    for (const auto &c : ccomps)
    {
        this->comparators.push_back([c](event &e)
                                    { return c(e); });
    }
    this->const_comparators = std::move(ccomps);
    return *this;
}

eventSorter
&eventSorter::set_bools(std::vector comps, std::vector ccomps) + { + if (comps.size() != ccomps.size()) + { + throw std::runtime_error("eventSorter::set_bools: size mismatch"); + } + this->event_sets.clear(); + this->comparators = std::move(comps); + this->const_comparators = std::move(ccomps); + return *this; + } + + size_t eventSorter::size() const + { + if (this->comparators.size() != this->const_comparators.size() && this->const_comparators.size() != 0) + { + throw std::runtime_error("eventSorter::size(): Inconsistent internal state in eventSorter"); + } + return std::max(this->comparators.size(), this->event_sets.size()); + } + + void eventSorter::extract_comparators() + { + this->comparators.clear(); + this->const_comparators.clear(); + for (auto &es : this->event_sets) + { + this->comparators.push_back(es->get_event_bool()); + this->const_comparators.push_back(es->get_const_event_bool()); + } + } + + size_t eventSorter::position(event &e) + { + if (this->comparators.size() < this->event_sets.size()) + this->extract_comparators(); + + for (size_t i = 0; i < this->comparators.size(); ++i) + { + if (!this->comparators[i]) + continue; + if (this->comparators[i](e)) + return i; + } + return npos; + } + + size_t eventSorter::position(const event &e) const + { + if (this->const_comparators.size() < this->event_sets.size()) + const_cast(this)->extract_comparators(); + + for (size_t i = 0; i < this->const_comparators.size(); ++i) + { + if (!this->const_comparators[i]) + continue; + if (this->const_comparators[i](e)) + return i; + } + return npos; + } + + size_t eventSorter::position(std::shared_ptr e) + { + if (this->comparators.size() < this->event_sets.size()) + { + this->extract_comparators(); + } + for (size_t i = 0; i < this->comparators.size(); ++i) + { + if (this->comparators[i](*e)) + { + return i; + } + } + return npos; + } + + std::vector eventSorter::position(std::vector &evs) + { + std::vector positions(evs.size(), npos); + for (size_t i = 0; i < 
evs.size(); ++i) + { + positions[i] = this->position(evs[i]); + } + return positions; + } + + std::vector eventSorter::position(const std::vector &evs) const + { + std::vector positions(evs.size(), npos); + for (size_t i = 0; i < evs.size(); ++i) + { + positions[i] = this->position(evs[i]); + } + return positions; + } + + std::vector eventSorter::position(std::vector> evs) + { + std::vector positions; + for (auto e : evs) + { + positions.push_back(this->position(e)); + } + return positions; + } + + std::vector eventSorter::sort(std::vector &evs) + { + return this->position(evs); + } + + std::vector eventSorter::sort(const std::vector &evs) const + { + return this->position(evs); + } + + std::vector eventSorter::sort(std::vector> evts) + { + return this->position(evts); + } + + event_hash_fn eventSorter::get_hash() + { + return [this](event &e) -> size_t + { + size_t pos = this->position(e); + return pos; + }; + } + + cevent_hash_fn eventSorter::get_const_hash() const + { + return [this](const event &e) -> size_t + { + size_t pos = this->position(e); + return pos; + }; + } + + eventSorter make_sample_sorter(const std::vector &sample, event_equal_fn comp) + { + if (sample.empty()) + { + throw std::invalid_argument("Sample vector is empty"); + } + eventSorter sorter; + for (const auto &ev : sample) + { + if (sorter.position(ev) == npos) + { + sorter.add_event_set(eventBelongs{ev, comp}); + } + } + return sorter; + } + + eventSorter make_sample_sorter(std::vector> sample, event_equal_fn comp) + { + if (sample.empty()) + { + throw std::invalid_argument("Sample vector is empty"); + } + eventSorter sorter; + for (auto ev : sample) + { + if (sorter.position(ev) == npos) + { + sorter.add_event_set(eventBelongs{*ev, comp}); + } + } + return sorter; + } + + process::process(std::vector> evs, bool filter_partons) + { + this->filter = filter_partons; + this->add_event(evs); + } + + process::process(std::vector evs, bool filter_partons) + { + this->filter = filter_partons; + 
this->add_event(evs); + } + + process &process::add_event_raw(const event &ev) + { + auto summed_n = (this->n_summed.empty() ? 0 : this->n_summed.back()) + ev.n_; + this->n_.push_back(ev.n_); + this->n_summed.push_back(summed_n); + this->proc_id_.push_back(ev.proc_id_); + this->weight_.push_back(ev.weight_); + this->scale_.push_back(ev.scale_); + auto muF = (ev.muF_ == 0.0) ? ev.scale_ : ev.muF_; + auto muR = (ev.muR_ == 0.0) ? ev.scale_ : ev.muR_; + auto muPS = (ev.muPS_ == 0.0) ? ev.scale_ : ev.muPS_; + this->muF_.push_back(muF); + this->muR_.push_back(muR); + this->muPS_.push_back(muPS); + this->alphaEW_.push_back(ev.alphaEW_); + this->alphaS_.push_back(ev.alphaS_); + for (auto prtcl : ev) + { + this->momenta_.push_back(prtcl.momentum_); + this->mass_.push_back(prtcl.mass_); + this->vtim_.push_back(prtcl.vtim_); + this->spin_.push_back(prtcl.spin_); + this->pdg_.push_back(prtcl.pdg_); + this->status_.push_back(prtcl.status_); + this->mother_.push_back(prtcl.mother_); + this->icol_.push_back(prtcl.icol_); + } + this->wgts_.push_back(ev.wgts_); + this->add_extra(ev.extra); + this->events.push_back(std::make_shared(ev)); + return *this; + } + + process &process::add_event_raw(std::shared_ptr ev) + { + auto summed_n = (this->n_summed.empty() ? 0 : this->n_summed.back()) + ev->n_; + this->n_.push_back(ev->n_); + this->n_summed.push_back(summed_n); + this->proc_id_.push_back(ev->proc_id_); + this->weight_.push_back(ev->weight_); + this->scale_.push_back(ev->scale_); + auto muF = (ev->muF_ == 0.0) ? ev->scale_ : ev->muF_; + auto muR = (ev->muR_ == 0.0) ? ev->scale_ : ev->muR_; + auto muPS = (ev->muPS_ == 0.0) ? 
ev->scale_ : ev->muPS_; + this->muF_.push_back(muF); + this->muR_.push_back(muR); + this->muPS_.push_back(muPS); + this->alphaEW_.push_back(ev->alphaEW_); + this->alphaS_.push_back(ev->alphaS_); + for (auto prtcl : *ev) + { + this->momenta_.push_back(prtcl.momentum_); + this->mass_.push_back(prtcl.mass_); + this->vtim_.push_back(prtcl.vtim_); + this->spin_.push_back(prtcl.spin_); + this->pdg_.push_back(prtcl.pdg_); + this->status_.push_back(prtcl.status_); + this->mother_.push_back(prtcl.mother_); + this->icol_.push_back(prtcl.icol_); + } + this->wgts_.push_back(ev->wgts_); + this->add_extra(ev->extra); + this->events.push_back(ev); + return *this; + } + + process &process::add_event_filtered(const event &ev) + { + auto ev_view = ev.view(); + auto summed_n = (this->n_summed.empty() ? 0 : this->n_summed.back()) + ev_view.size(); + this->n_.push_back(ev_view.size()); + this->n_summed.push_back(summed_n); + this->proc_id_.push_back(ev.proc_id_); + this->weight_.push_back(ev.weight_); + this->scale_.push_back(ev.scale_); + auto muF = (ev.muF_ == 0.0) ? ev.scale_ : ev.muF_; + auto muR = (ev.muR_ == 0.0) ? ev.scale_ : ev.muR_; + auto muPS = (ev.muPS_ == 0.0) ? ev.scale_ : ev.muPS_; + this->muF_.push_back(muF); + this->muR_.push_back(muR); + this->muPS_.push_back(muPS); + this->alphaEW_.push_back(ev.alphaEW_); + this->alphaS_.push_back(ev.alphaS_); + for (auto prtcl : ev_view) + { + this->momenta_.push_back(prtcl.momentum_); + this->mass_.push_back(prtcl.mass_); + this->vtim_.push_back(prtcl.vtim_); + this->spin_.push_back(prtcl.spin_); + this->pdg_.push_back(prtcl.pdg_); + this->status_.push_back(prtcl.status_); + this->mother_.push_back(prtcl.mother_); + this->icol_.push_back(prtcl.icol_); + } + this->wgts_.push_back(ev.wgts_); + this->add_extra(ev.extra); + this->events.push_back(std::make_shared(ev)); + return *this; + } + + process &process::add_event_filtered(std::shared_ptr ev) + { + auto ev_view = ev->view(); + auto summed_n = (this->n_summed.empty() ? 
0 : this->n_summed.back()) + ev_view.size(); + this->n_.push_back(ev_view.size()); + this->n_summed.push_back(summed_n); + this->proc_id_.push_back(ev->proc_id_); + this->weight_.push_back(ev->weight_); + this->scale_.push_back(ev->scale_); + auto muF = (ev->muF_ == 0.0) ? ev->scale_ : ev->muF_; + auto muR = (ev->muR_ == 0.0) ? ev->scale_ : ev->muR_; + auto muPS = (ev->muPS_ == 0.0) ? ev->scale_ : ev->muPS_; + this->muF_.push_back(muF); + this->muR_.push_back(muR); + this->muPS_.push_back(muPS); + this->alphaEW_.push_back(ev->alphaEW_); + this->alphaS_.push_back(ev->alphaS_); + for (auto prtcl : ev_view) + { + this->momenta_.push_back(prtcl.momentum_); + this->mass_.push_back(prtcl.mass_); + this->vtim_.push_back(prtcl.vtim_); + this->spin_.push_back(prtcl.spin_); + this->pdg_.push_back(prtcl.pdg_); + this->status_.push_back(prtcl.status_); + this->mother_.push_back(prtcl.mother_); + this->icol_.push_back(prtcl.icol_); + } + this->wgts_.push_back(ev->wgts_); + this->add_extra(ev->extra); + this->events.push_back(ev); + return *this; + } + + process &process::add_event(const event &ev) + { + if (this->filter) + { + return this->add_event_filtered(ev); + } + else + { + return this->add_event_raw(ev); + } + } + + process &process::add_event(std::shared_ptr ev) + { + if (this->filter) + { + return this->add_event_filtered(ev); + } + else + { + return this->add_event_raw(ev); + } + } + + process &process::add_event(const std::vector &evs) + { + for (const auto &ev : evs) + { + this->add_event(ev); + } + return *this; + } + + process &process::add_event(std::vector> evs) + { + for (auto ev : evs) + { + this->add_event(ev); + } + return *this; + } + + std::vector process::E() + { + std::vector energies; + energies.reserve(this->momenta_.size()); + for (const auto &p : this->momenta_) + { + energies.push_back(p[0]); // p[0] is defined to be the energy, using the (1,-1,-1,-1) Minkowski metric convention + } + return energies; + } + std::vector process::t() { return 
this->E(); } + + std::vector process::x() + { + std::vector x_; + x_.reserve(this->momenta_.size()); + for (const auto &p : this->momenta_) + { + x_.push_back(p[1]); // p[1] is defined to be the x component + } + return x_; + } + std::vector process::px() { return this->x(); } + + std::vector process::y() + { + std::vector y_; + y_.reserve(this->momenta_.size()); + for (const auto &p : this->momenta_) + { + y_.push_back(p[2]); // p[2] is defined to be the y component + } + return y_; + } + std::vector process::py() { return this->y(); } + + std::vector process::z() + { + std::vector z_; + z_.reserve(this->momenta_.size()); + for (const auto &p : this->momenta_) + { + z_.push_back(p[3]); // p[3] is defined to be the z component + } + return z_; + } + std::vector process::pz() { return this->z(); } + + process &process::set_E(const std::vector &E) + { + if (E.size() != this->momenta_.size()) + { + throw std::runtime_error("process::set_E: size mismatch"); + } + for (size_t i = 0; i < E.size(); ++i) + { + this->momenta_[i][0] = E[i]; + } + return *this; + } + process &process::set_t(const std::vector &pt) { return this->set_E(pt); } + + process &process::set_x(const std::vector &x) + { + if (x.size() != this->momenta_.size()) + { + throw std::runtime_error("process::set_x: size mismatch"); + } + for (size_t i = 0; i < x.size(); ++i) + { + this->momenta_[i][1] = x[i]; + } + return *this; + } + process &process::set_px(const std::vector &px) { return this->set_x(px); } + + process &process::set_y(const std::vector &y) + { + if (y.size() != this->momenta_.size()) + { + throw std::runtime_error("process::set_y: size mismatch"); + } + for (size_t i = 0; i < y.size(); ++i) + { + this->momenta_[i][2] = y[i]; + } + return *this; + } + process &process::set_py(const std::vector &py) { return this->set_y(py); } + + process &process::set_z(const std::vector &z) + { + if (z.size() != this->momenta_.size()) + { + throw std::runtime_error("process::set_z: size mismatch"); + } + for 
(size_t i = 0; i < z.size(); ++i) + { + this->momenta_[i][3] = z[i]; + } + return *this; + } + process &process::set_pz(const std::vector &pz) { return this->set_z(pz); } + + std::vector process::gS() + { + std::vector gS_; + gS_.resize(this->alphaS_.size()); + std::transform(this->alphaS_.begin(), this->alphaS_.end(), gS_.begin(), [](double alpha) + { return std::sqrt(4 * pi * alpha); }); + return gS_; + } + + process &process::set_gS(const std::vector &gS) + { + if (gS.size() != this->alphaS_.size() && !this->alphaS_.empty()) + { + warning("process::set_gS: size mismatch. process::alphaS will be overwritten and may lead to future indexing errors."); + } + this->alphaS_.clear(); + this->alphaS_.resize(gS.size()); + std::transform(gS.begin(), gS.end(), this->alphaS_.begin(), [](double gS) + { return gS * gS / (4. * pi); }); + return *this; + } + + process &process::set_n(const std::vector &n) + { + this->n_ = n; + this->n_summed.clear(); + this->n_summed.reserve(n.size()); + size_t summed_n = 0; + for (const auto &n_i : n) + { + summed_n += n_i; + this->n_summed.push_back(summed_n); + } + return *this; + } + + process &process::set_n_summed(const std::vector &n_summed) + { + this->n_summed = n_summed; + return *this; + } + + process &process::set_proc_id(const std::vector &proc_id) + { + this->proc_id_ = proc_id; + return *this; + } + + process &process::set_weight(const std::vector &weight) + { + this->weight_ = weight; + return *this; + } + + process &process::set_scale(const std::vector &scale) + { + this->scale_ = scale; + return *this; + } + + process &process::set_muF(const std::vector &muF) + { + this->muF_ = muF; + return *this; + } + + process &process::set_muR(const std::vector &muR) + { + this->muR_ = muR; + return *this; + } + + process &process::set_muPS(const std::vector &muPS) + { + this->muPS_ = muPS; + return *this; + } + + process &process::set_alphaEW(const std::vector &alphaEW) + { + this->alphaEW_ = alphaEW; + return *this; + } + + process 
&process::set_alphaS(const std::vector &alphaS) + { + this->alphaS_ = alphaS; + return *this; + } + + process &process::set_momenta(const vecArr4 &momenta) + { + this->momenta_ = momenta; + return *this; + } + + process &process::set_mass(const std::vector &mass) + { + this->mass_ = mass; + return *this; + } + + process &process::set_vtim(const std::vector &vtim) + { + this->vtim_ = vtim; + return *this; + } + + process &process::set_spin(const std::vector &spin) + { + this->spin_ = spin; + return *this; + } + + process &process::set_pdg(const std::vector &pdg) + { + this->pdg_ = pdg; + return *this; + } + + process &process::set_status(const std::vector &status) + { + this->status_ = status; + return *this; + } + + process &process::set_mother(const vecArr2 &mother) + { + this->mother_ = mother; + return *this; + } + + process &process::set_icol(const vecArr2 &icol) + { + this->icol_ = icol; + return *this; + } + + process &process::set_wgts(const std::vector> &wgts) + { + this->wgts_ = wgts; + return *this; + } + + // Note: append_wgts() appends one weight to each existing wgt vector + // To add an additional wgt vector, access the wgts vector directly + // Assumes incoming weights are ordered according to the event index + // and appends zero weights if the event index is greater than the current number of weight vectors + process &process::append_wgts(const std::vector &new_wgts) + { + if (this->wgts_.size() < new_wgts.size()) + { + this->wgts_.resize(new_wgts.size()); + } + for (size_t i = 0; i < this->wgts_.size(); ++i) + { + if (i < new_wgts.size()) + { + this->wgts_[i].push_back(new_wgts[i]); + } + else + { + this->wgts_[i].push_back(0.0); + } + } + return *this; + } + + process &process::add_extra(const std::string &key, const std::any &value) + { + auto it = this->extra.find(key); + if (it != this->extra.end()) + { + it->second.push_back(value); + } + else + { + this->validate(); + if (this->n_.size() > 1) + warning("process::add_extra: adding new extra 
field to non-empty process. This may lead to inconsistencies when transposing between the object oriented event format and the SoA process format.\nFor future reference, please ensure that all events contain the same extra fields.\nIn this instance, " + key + " will be added with empty std::any objects up until the current event index."); + this->extra[key] = {}; + this->extra[key].resize(this->n_.size() - 1); + this->extra[key].emplace_back(value); + } + return *this; + } + + void process::validate() const + { + auto n_size = this->n_.size(); + if (n_size == 0) + { + if (!this->n_summed.empty() || !this->momenta_.empty() || + !this->mass_.empty() || !this->vtim_.empty() || + !this->spin_.empty() || !this->pdg_.empty() || + !this->proc_id_.empty() || !this->weight_.empty() || + !this->scale_.empty() || !this->muF_.empty() || + !this->muR_.empty() || !this->muPS_.empty() || + !this->alphaEW_.empty() || !this->alphaS_.empty()) + { + throw std::runtime_error("process::validate() failed: n is empty but other vectors are not"); + } + } + + auto check_event_size = [n_size](const auto &vec, const char *name) + { + if (vec.size() != n_size) + { + std::ostringstream oss; + oss << "process::validate() failed: '" << name + << "' has size " << vec.size() << ", expected " << n_size; + throw std::runtime_error(oss.str()); + } + }; + + auto check_mu_size = [n_size](const auto &vec, const char *name) + { + if (vec.size() != n_size && !vec.empty()) + { + std::ostringstream oss; + oss << "process::validate() failed: '" << name + << "' has size " << vec.size() << ", expected " << n_size; + throw std::runtime_error(oss.str()); + } + }; + + check_event_size(this->n_, "n"); + check_event_size(this->n_summed, "n_summed"); + check_event_size(this->proc_id_, "proc_id"); + check_event_size(this->weight_, "weight"); + check_event_size(this->scale_, "scale"); + check_mu_size(this->muF_, "muF"); + check_mu_size(this->muR_, "muR"); + check_mu_size(this->muPS_, "muPS"); + 
check_event_size(this->alphaEW_, "alphaEW"); + check_event_size(this->alphaS_, "alphaS"); + check_event_size(this->wgts_, "wgts"); + + size_t summed_n = this->n_summed.empty() ? 0 : this->n_summed.back(); + + size_t explicit_n_summed = std::accumulate(this->n_.begin(), this->n_.end(), 0); + + if (summed_n != explicit_n_summed) + { + throw std::runtime_error("process::validate() failed: n_summed does not match sum of n"); + } + + auto check_particle_size = [summed_n](const auto &vec, const char *name) + { + if (vec.size() != summed_n) + { + std::ostringstream oss; + oss << "process::validate() failed: '" << name + << "' has size " << vec.size() << ", expected " << summed_n; + throw std::runtime_error(oss.str()); + } + }; + + check_particle_size(this->momenta_, "momenta"); + check_particle_size(this->mass_, "mass"); + check_particle_size(this->vtim_, "vtim"); + check_particle_size(this->spin_, "spin"); + check_particle_size(this->pdg_, "pdg"); + check_particle_size(this->status_, "status"); + check_particle_size(this->mother_, "mother"); + check_particle_size(this->icol_, "icol"); + } + + process &process::add_extra(const std::unordered_map &extras) + { + for (const auto &[key, value] : extras) + { + this->add_extra(key, value); + } + return *this; + } + + void process::make_event(size_t idx) + { + if (idx >= this->n_.size()) + { + throw std::out_of_range("Invalid event index"); + } + std::shared_ptr ev = std::make_shared(); + ev->set_n(this->n_[idx]).set_proc_id(this->proc_id_[idx]).set_weight(this->weight_[idx]).set_scale(this->scale_[idx]).set_alphaEW(this->alphaEW_[idx]).set_alphaS(this->alphaS_[idx]).set_wgts(this->wgts_[idx]); + if (this->muF_.size() > idx) + if (this->muF_[idx] > 1e-08 && std::abs(this->muF_[idx] - this->scale_[idx]) > 1e-08) + { + ev->set_muF(this->muF_[idx]); + } + if (this->muR_.size() > idx) + if (this->muR_[idx] > 1e-08 && std::abs(this->muR_[idx] - this->scale_[idx]) > 1e-08) + { + ev->set_muR(this->muR_[idx]); + } + if 
(this->muPS_.size() > idx) + if (this->muPS_[idx] > 1e-08 && std::abs(this->muPS_[idx] - this->scale_[idx]) > 1e-08) + { + ev->set_muPS(this->muPS_[idx]); + } + size_t begin; + if (idx == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[idx - 1]; + } + size_t end = this->n_summed[idx]; + ev->set_momenta(this->momenta_.subvec(begin, end)).set_mass(subvector(this->mass_, begin, end)).set_vtim(subvector(this->vtim_, begin, end)).set_spin(subvector(this->spin_, begin, end)).set_pdg(subvector(this->pdg_, begin, end)).set_status(subvector(this->status_, begin, end)).set_mother(this->mother_.subvec(begin, end)).set_icol(this->icol_.subvec(begin, end)); + for (auto &[key, values] : this->extra) + { + if (values.size() < this->n_.size()) + { + values.resize(this->n_.size()); + } + ev->set(key, values[idx]); + } + if (idx >= this->events.size()) + { + events.resize(idx + 1); + events[idx] = ev; + } + else + { + events[idx] = ev; + } + } + + process &process::add_extra(const std::string &key, const std::vector &values) + { + if (values.size() < this->n_.size()) + { + warning("process::add_extra() - Resizing vector for key: " + key + " to match number of events indicated by n.size().\nFor future reference, please ensure that data sizes match to avoid inconsistencies in transpositions."); + } + this->extra[key] = values; + this->extra[key].resize(this->n_.size()); + return *this; + } + + process &process::add_extra(const std::unordered_map> &values) + { + for (const auto &[key, val] : values) + { + this->extra[key] = val; + if (val.size() < this->n_.size()) + { + warning("process::add_extra() - Resizing vector for key: " + key + " to match number of events indicated by n.size().\nFor future reference, please ensure that data sizes match to avoid inconsistencies in transpositions."); + this->extra[key].resize(this->n_.size()); + } + } + return *this; + } + + process &process::set_extra(const std::unordered_map> &values) + { + this->extra.clear(); + 
this->extra.reserve(values.size()); + for (const auto &[key, val] : values) + { + this->extra[key] = val; + if (val.size() < this->n_.size()) + { + warning("process::set_extra() - Resizing vector for key: " + key + " to match number of events indicated by n.size().\nFor future reference, please ensure that data sizes match to avoid inconsistencies in transpositions."); + this->extra[key].resize(this->n_.size()); + } + } + return *this; + } + + process &process::set_filter(bool v) + { + this->filter = v; + return *this; + } + + // Function to transpose process information into the OOP event format + void process::transpose() + { + this->validate(); + this->events.clear(); + for (size_t i = 0; i < this->n_.size(); ++i) + { + this->make_event(i); + } + } + + // Functions for partial transpositions + process &process::transpose_n() + { + this->validate(); + if (this->n_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_n() - Number of events does not match number of n values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + this->events[i]->set_n(this->n_[i]); + } + return *this; + } + process &process::transpose_nUP() { return this->transpose_n(); } + + process &process::transpose_proc_id() + { + this->validate(); + if (this->proc_id_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_proc_id() - Number of events does not match number of proc_id values. 
Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + this->events[i]->set_proc_id(this->proc_id_[i]); + } + return *this; + } + process &process::transpose_idPrUP() { return this->transpose_proc_id(); } + process &process::transpose_idPr() { return this->transpose_proc_id(); } + + process &process::transpose_weight() + { + this->validate(); + if (this->weight_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_weight() - Number of events does not match number of weight values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + this->events[i]->set_weight(this->weight_[i]); + } + return *this; + } + process &process::transpose_xWgtUP() { return this->transpose_weight(); } + process &process::transpose_xWgt() { return this->transpose_weight(); } + + process &process::transpose_scale() + { + this->validate(); + if (this->scale_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_scale() - Number of events does not match number of scale values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + this->events[i]->set_scale(this->scale_[i]); + } + return *this; + } + process &process::transpose_scalUP() { return this->transpose_scale(); } + + process &process::transpose_muF() + { + this->validate(); + if (this->muF_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_muF() - Number of events does not match number of muF values. 
Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + if (this->muF_[i] > 1e-08 && std::abs(this->muF_[i] - this->scale_[i]) > 1e-08) + { + this->events[i]->set_muF(this->muF_[i]); + } + else + { + this->events[i]->set_muF(0.0); + } + } + return *this; + } + + process &process::transpose_muR() + { + this->validate(); + if (this->muR_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_muR() - Number of events does not match number of muR values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + if (this->muR_[i] > 1e-08 && std::abs(this->muR_[i] - this->scale_[i]) > 1e-08) + { + this->events[i]->set_muR(this->muR_[i]); + } + else + { + this->events[i]->set_muR(0.0); + } + } + return *this; + } + + process &process::transpose_muPS() + { + this->validate(); + if (this->muPS_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_muPS() - Number of events does not match number of muPS values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + if (this->muPS_[i] > 1e-08 && std::abs(this->muPS_[i] - this->scale_[i]) > 1e-08) + { + this->events[i]->set_muPS(this->muPS_[i]); + } + else + { + this->events[i]->set_muPS(0.0); + } + } + return *this; + } + + process &process::transpose_alphaEW() + { + this->validate(); + if (this->alphaEW_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_alphaEW() - Number of events does not match number of alphaEW values. 
Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + if (this->alphaEW_[i] > 1e-08 && std::abs(this->alphaEW_[i] - this->scale_[i]) > 1e-08) + { + this->events[i]->set_alphaEW(this->alphaEW_[i]); + } + else + { + this->events[i]->set_alphaEW(0.0); + } + } + return *this; + } + process &process::transpose_aQEDUP() { return this->transpose_alphaEW(); } + process &process::transpose_aQED() { return this->transpose_alphaEW(); } + + process &process::transpose_alphaS() + { + this->validate(); + if (this->alphaS_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_alphaS() - Number of events does not match number of alphaS values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + if (this->alphaS_[i] > 1e-08 && std::abs(this->alphaS_[i] - this->scale_[i]) > 1e-08) + { + this->events[i]->set_alphaS(this->alphaS_[i]); + } + else + { + this->events[i]->set_alphaS(0.0); + } + } + return *this; + } + process &process::transpose_aQCDUP() { return this->transpose_alphaS(); } + process &process::transpose_aQCD() { return this->transpose_alphaS(); } + + process &process::transpose_momenta() + { + this->validate(); + if (this->momenta_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_momenta() - Number of events does not match number of momenta values. 
Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + this->events[i]->set_momenta(this->momenta_.subvec(begin, end)); + } + return *this; + } + process &process::transpose_pUP() { return this->transpose_momenta(); } + process &process::transpose_mom() { return this->transpose_momenta(); } + process &process::transpose_p() { return this->transpose_momenta(); } + + process &process::transpose_mass() + { + this->validate(); + if (this->mass_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_mass() - Number of events does not match number of mass values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + this->events[i]->set_mass(subvector(this->mass_, begin, end)); + } + return *this; + } + process &process::transpose_mUP() { return this->transpose_mass(); } + process &process::transpose_m() { return this->transpose_mass(); } + + process &process::transpose_vtim() + { + this->validate(); + if (this->vtim_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_vtim() - Number of events does not match number of vtim values. 
Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + this->events[i]->set_vtim(subvector(this->vtim_, begin, end)); + } + return *this; + } + process &process::transpose_vTimUP() { return this->transpose_vtim(); } + + process &process::transpose_spin() + { + this->validate(); + if (this->spin_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_spin() - Number of events does not match number of spin values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->n_summed.back(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + this->events[i]->set_spin(subvector(this->spin_, begin, end)); + } + return *this; + } + process &process::transpose_spinUP() { return this->transpose_spin(); } + + process &process::transpose_pdg() + { + this->validate(); + if (this->pdg_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_pdg() - Number of events does not match number of pdg values. 
Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + this->events[i]->set_pdg(subvector(this->pdg_, begin, end)); + } + return *this; + } + process &process::transpose_idUP() { return this->transpose_pdg(); } + process &process::transpose_id() { return this->transpose_pdg(); } + + process &process::transpose_status() + { + this->validate(); + if (this->status_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_status() - Number of events does not match number of status values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + this->events[i]->set_status(subvector(this->status_, begin, end)); + } + return *this; + } + process &process::transpose_iStUP() { return this->transpose_status(); } + process &process::transpose_iSt() { return this->transpose_status(); } + + process &process::transpose_mother() + { + this->validate(); + if (this->mother_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_mother() - Number of events does not match number of mother values. 
Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + this->events[i]->set_mother(this->mother_.subvec(begin, end)); + } + return *this; + } + process &process::transpose_mothUP() { return this->transpose_mother(); } + + process &process::transpose_icol() + { + this->validate(); + if (this->icol_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_icol() - Number of events does not match number of icol values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + this->events[i]->set_icol(this->icol_.subvec(begin, end)); + } + return *this; + } + process &process::transpose_iColUP() { return this->transpose_icol(); } + + process &process::transpose_wgts() + { + this->validate(); + if (this->wgts_.size() != this->events.size()) + { + if (!this->events.empty()) + warning("process::transpose_wgts() - Number of events does not match number of wgts values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + this->events[i]->set_wgts(this->wgts_[i]); + } + return *this; + } + + process &process::transpose_extra() + { + this->validate(); + if (this->extra.empty()) + { + warning("process::transpose_extra() - No extra fields to transpose."); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + for (const auto &[key, values] : this->extra) + { + if (values.size() <= i) + { + warning("process::transpose_extra() - Not enough values for key: " + key + ". 
Resizing to match number of events."); + this->extra[key].resize(this->events.size()); + } + this->events[i]->set(key, values[i]); + } + } + return *this; + } + + process &process::transpose_E() + { + this->validate(); + if (this->momenta_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_E() - Number of events does not match number of E values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + auto full_mom = this->momenta_.subvec(begin, end); + if (full_mom.size() != this->events.size()) + { + throw std::runtime_error("process::transpose_E() - Size mismatch between momenta and events."); + } + for (size_t j = 0; j < full_mom.size(); ++j) + { + this->events[i]->at(j).E() = full_mom[j][0]; + } + } + return *this; + } + process &process::transpose_t() { return this->transpose_E(); } + + process &process::transpose_x() + { + this->validate(); + if (this->momenta_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_x() - Number of events does not match number of x values. 
Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + auto full_mom = this->momenta_.subvec(begin, end); + if (full_mom.size() != this->events.size()) + { + throw std::runtime_error("process::transpose_x() - Size mismatch between momenta and events."); + } + for (size_t j = 0; j < full_mom.size(); ++j) + { + this->events[i]->at(j).px() = full_mom[j][1]; + } + } + return *this; + } + process &process::transpose_px() { return this->transpose_x(); } + + process &process::transpose_y() + { + this->validate(); + if (this->momenta_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_y() - Number of events does not match number of y values. Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + auto full_mom = this->momenta_.subvec(begin, end); + if (full_mom.size() != this->events.size()) + { + throw std::runtime_error("process::transpose_y() - Size mismatch between momenta and events."); + } + for (size_t j = 0; j < full_mom.size(); ++j) + { + this->events[i]->at(j).py() = full_mom[j][2]; + } + } + return *this; + } + process &process::transpose_py() { return this->transpose_y(); } + + process &process::transpose_z() + { + this->validate(); + if (this->momenta_.size() != this->n_summed.back()) + { + if (!this->events.empty()) + warning("process::transpose_z() - Number of events does not match number of z values. 
Overwriting process::events vector."); + this->transpose(); + return *this; + } + for (size_t i = 0; i < this->events.size(); ++i) + { + size_t begin; + if (i == 0) + { + begin = 0; + } + else + { + begin = this->n_summed[i - 1]; + } + size_t end = this->n_summed[i]; + auto full_mom = this->momenta_.subvec(begin, end); + if (full_mom.size() != this->n_summed.back()) + { + throw std::runtime_error("process::transpose_z() - Size mismatch between momenta and events."); + } + for (size_t j = 0; j < full_mom.size(); ++j) + { + this->events[i]->at(j).pz() = full_mom[j][3]; + } + } + return *this; + } + process &process::transpose_pz() { return this->transpose_z(); } + + initNode::initNode(short unsigned int nproc) + { + this->nProc_ = nproc; + this->xSec_.resize(nproc, 0.0); + this->xSecErr_.resize(nproc, 0.0); + this->xMax_.resize(nproc, 0.0); + this->lProc_.resize(nproc, 0); + } + + initNode::initNode(size_t nproc) + { + // Check that nproc fits in a short + if (nproc > std::numeric_limits::max()) + { + throw std::invalid_argument("initNode::initNode() - nproc is too large"); + } + this->nProc_ = static_cast(nproc); + this->xSec_.resize(nproc, 0.0); + this->xSecErr_.resize(nproc, 0.0); + this->xMax_.resize(nproc, 0.0); + this->lProc_.resize(nproc, 0); + } + + void initNode::validate_init() const + { + if (this->nProc_ != this->xSec_.size()) + { + throw std::runtime_error("initNode::validate_init() failed: nProc does not match size of xSec"); + } + if (this->nProc_ != this->xSecErr_.size()) + { + throw std::runtime_error("initNode::validate_init() failed: nProc does not match size of xSecErr"); + } + if (this->nProc_ != this->xMax_.size()) + { + throw std::runtime_error("initNode::validate_init() failed: nProc does not match size of xMax"); + } + if (this->nProc_ != this->lProc_.size()) + { + throw std::runtime_error("initNode::validate_init() failed: nProc does not match size of lProc"); + } + } + + void initNode::print_head(std::ostream &os) const + { + os << 
std::scientific << std::setprecision(6) + << this->idBm_[0] << " " << this->idBm_[1] + << " " << this->eBm_[0] << " " << this->eBm_[1] + << " " << this->pdfG_[0] << " " << this->pdfG_[1] + << " " << this->pdfS_[0] << " " << this->pdfS_[1] + << " " << this->idWgt_ + << " " << this->nProc_ + << "\n"; + } + + void initNode::print_body(std::ostream &os) const + { + os << std::scientific << std::setprecision(6); + for (size_t i = 0; i < this->xSec_.size(); ++i) + { + os << this->xSec_[i] << " " << this->xSecErr_[i] + << " " << this->xMax_[i] << " " << this->lProc_[i] + << "\n"; + } + } + + void initNode::print_extra(std::ostream &os) const + { + for (const auto &[key, val] : this->extra) + { + if (val.type() == typeid(int)) + { + os << "<" << key << "\">" << std::any_cast(val) << "\n"; + } + else if (val.type() == typeid(double)) + { + os << std::setprecision(10) << std::scientific; + os << "<" << key << "\">" << std::any_cast(val) << "\n"; + } + else if (val.type() == typeid(std::string)) + { + os << "<" << key << "\">" << std::any_cast(val) << "\n"; + } + else if (val.type() == typeid(std::string_view)) + { + os << "<" << key << "\">" << std::any_cast(val) << "\n"; + } + else if (val.type() == typeid(xmlNode)) + { + std::any_cast(val).write(os); + os << "\n"; + } + else if (val.type() == typeid(std::shared_ptr)) + { + std::any_cast>(val)->write(os); + os << "\n"; + } + else + { + warning("Unknown type for extra field: " + key + ", skipping print"); + } + } + } + + void initNode::print_init(std::ostream &os) const + { + this->validate_init(); + os << "\n\n"; + this->print_head(os); + this->print_body(os); + this->print_extra(os); + os << ""; + } + + lhe::lhe(std::vector> evs) + { + this->events = evs; + std::vector proc_ids = {}; + this->sync_weight_ids(); + for (auto ev : evs) + { + if (std::find(proc_ids.begin(), proc_ids.end(), ev->proc_id_) == proc_ids.end()) + { + proc_ids.push_back(ev->proc_id_); + } + } + this->nProc_ = proc_ids.size(); + 
std::sort(proc_ids.begin(), proc_ids.end()); + this->lProc_ = proc_ids; + } + + lhe::lhe(std::vector evs) + { + this->events.reserve(evs.size()); + for (const auto &ev : evs) + { + this->events.push_back(std::make_shared(ev)); + } + this->sync_weight_ids(); + std::vector proc_ids = {}; + for (auto ev : this->events) + { + if (std::find(proc_ids.begin(), proc_ids.end(), ev->proc_id_) == proc_ids.end()) + { + proc_ids.push_back(ev->proc_id_); + } + } + this->nProc_ = proc_ids.size(); + std::sort(proc_ids.begin(), proc_ids.end()); + this->lProc_ = proc_ids; + } + + lhe::lhe(const initNode &i, std::vector> evts) : initNode(i), events(std::move(evts)) + { + std::vector proc_ids = {}; + this->sync_weight_ids(); + for (auto ev : this->events) + { + if (std::find(proc_ids.begin(), proc_ids.end(), ev->proc_id_) == proc_ids.end()) + { + proc_ids.push_back(ev->proc_id_); + } + } + this->nProc_ = proc_ids.size(); + std::sort(proc_ids.begin(), proc_ids.end()); + this->lProc_ = proc_ids; + } + + lhe::lhe(const initNode &i, std::vector evts) : initNode(i) + { + this->events.reserve(evts.size()); + for (const auto &ev : evts) + { + this->events.push_back(std::make_shared(ev)); + } + this->sync_weight_ids(); + std::vector proc_ids = {}; + for (auto ev : this->events) + { + if (std::find(proc_ids.begin(), proc_ids.end(), ev->proc_id_) == proc_ids.end()) + { + proc_ids.push_back(ev->proc_id_); + } + } + this->nProc_ = proc_ids.size(); + std::sort(proc_ids.begin(), proc_ids.end()); + this->lProc_ = proc_ids; + } + + lhe &lhe::set_events(std::vector> evs) + { + this->events = evs; + this->sync_weight_ids(); + return *this; + } + + lhe &lhe::set_processes(std::vector> procs) + { + this->processes = procs; + return *this; + } + + lhe &lhe::set_header(std::any hdr) + { + this->header = std::move(hdr); + return *this; + } + + lhe &lhe::set_weight_ids(const std::vector &ids) + { + *this->weight_ids = ids; + return *this; + } + + lhe &lhe::set_weight_ids(std::vector &&ids) + { + 
*this->weight_ids = std::move(ids); + return *this; + } + + lhe &lhe::set_weight_ids(std::shared_ptr> ids) + { + this->weight_ids = std::move(ids); + return *this; + } + + lhe &lhe::add_weight_id(const std::string &id) + { + this->weight_ids->push_back(id); + return *this; + } + + lhe &lhe::add_weight_id(std::string &&id) + { + this->weight_ids->push_back(std::move(id)); + return *this; + } + + void lhe::extract_weight_ids() + { + if (!this->weight_ids) + { + this->weight_ids = std::make_shared>(); + } + if (this->header.type() != typeid(std::shared_ptr)) + { + throw std::runtime_error("lhe::extract_weight_ids() - Header is not of type std::shared_ptr"); + } + auto xml_header = std::any_cast>(this->header); + auto id_puller = [&](std::shared_ptr node) + { + auto atrs = node->attrs(); + for (auto atr : atrs) + { + if (atr.name() == "id") + this->weight_ids->push_back(std::string(atr.value())); + } + }; + // Extract weight IDs from the XML header + auto initrwgt = xml_header->get_child("initrwgt"); + if (initrwgt) + { + for (auto child : initrwgt->children()) + { + if (child->name() == "weight") + { + id_puller(child); + } + else if (child->name() == "weightgroup") + { + for (auto grandchild : child->children()) + { + if (grandchild->name() == "weight") + { + id_puller(grandchild); + } + } + } + } + } + for (auto ev : this->events) + { + ev->weight_ids = this->weight_ids; // overwrites weight_ids if original set has several different ones + } + } + + void lhe::sync_weight_ids() + { + if (!this->weight_ids) + { + this->weight_ids = std::make_shared>(); + } + for (auto ev : this->events) + { + if (!ev) + continue; + if (ev->weight_ids) + { // If event has weight_ids, check if it's larger than current + if (ev->weight_ids->size() > this->weight_ids->size()) + this->weight_ids = ev->weight_ids; + } + } + for (auto ev : this->events) + { + ev->weight_ids = this->weight_ids; // overwrites weight_ids if original set has several different ones + } + } + + initNode 
&initNode::set_idBm(const arr2 &idBm) + { + this->idBm_ = idBm; + return *this; + } + + initNode &initNode::set_idBm(long int id1, long int id2) + { + this->idBm_ = {id1, id2}; + return *this; + } + + initNode &initNode::set_eBm(const arr2 &energies) + { + this->eBm_ = energies; + return *this; + } + + initNode &initNode::set_eBm(double e1, double e2) + { + this->eBm_ = {e1, e2}; + return *this; + } + + initNode &initNode::set_pdfG(const arr2 &pdfG) + { + this->pdfG_ = pdfG; + return *this; + } + + initNode &initNode::set_pdfG(short int pdf1, short int pdf2) + { + this->pdfG_ = {pdf1, pdf2}; + return *this; + } + + initNode &initNode::set_pdfS(const arr2 &pdfS) + { + this->pdfS_ = pdfS; + return *this; + } + + initNode &initNode::set_pdfS(long int pdf1, long int pdf2) + { + this->pdfS_ = {pdf1, pdf2}; + return *this; + } + + initNode &initNode::set_idWgt(short int id) + { + this->idWgt_ = id; + return *this; + } + + initNode &initNode::set_nProc(short unsigned int n) + { + this->nProc_ = n; + return *this; + } + + initNode &initNode::set_xSec(const std::vector &xSec) + { + this->xSec_ = xSec; + return *this; + } + + initNode &initNode::set_xSecErr(const std::vector &xSecErr) + { + this->xSecErr_ = xSecErr; + return *this; + } + initNode &initNode::set_xMax(const std::vector &xMax) + { + this->xMax_ = xMax; + return *this; + } + + initNode &initNode::set_lProc(const std::vector &lProc) + { + this->lProc_ = lProc; + return *this; + } + + initNode &initNode::add_xSec(double xsec) + { + this->xSec_.push_back(xsec); + return *this; + } + + initNode &initNode::add_xSecErr(double xsec_err) + { + this->xSecErr_.push_back(xsec_err); + return *this; + } + + initNode &initNode::add_xMax(double xmax) + { + this->xMax_.push_back(xmax); + return *this; + } + + initNode &initNode::add_lProc(long int lproc) + { + this->lProc_.push_back(lproc); + return *this; + } + + lhe &lhe::add_event(std::shared_ptr ev) + { + this->events.push_back(ev); + return *this; + } + + lhe 
&lhe::add_event(const event &ev) + { + this->events.push_back(std::make_shared(ev)); + return *this; + } + + // If no sorter is provided, sort by external partons + lhe &lhe::set_sorter() + { + if (this->events.empty()) + { + throw std::runtime_error("lhe::set_sorter() called with no events"); + } + this->sorter = make_sample_sorter(this->events); + this->event_hash = this->sorter.get_hash(); + return *this; + } + + lhe &lhe::set_sorter(event_equal_fn comp) + { + if (this->events.empty()) + { + throw std::runtime_error("lhe::set_sorter() called with no events"); + } + this->sorter = make_sample_sorter(this->events, comp); + this->event_hash = this->sorter.get_hash(); + return *this; + } + + lhe &lhe::set_sorter(cevent_equal_fn comp) + { + if (this->events.empty()) + { + throw std::runtime_error("lhe::set_sorter() called with no events"); + } + this->sorter = make_sample_sorter(this->events, comp); + this->event_hash = this->sorter.get_hash(); + return *this; + } + + lhe &lhe::set_sorter(const eventSorter &sort) + { + this->sorter = sort; + this->event_hash = this->sorter.get_hash(); + return *this; + } + + void lhe::extract_hash() + { + if (!this->sorter.size()) + this->set_sorter(); + this->event_hash = this->sorter.get_hash(); + } + + lhe &lhe::set_hash(event_hash_fn hash) + { + this->event_hash = hash; + return *this; + } + + lhe &lhe::set_filter(bool filter) + { + this->filter_processes = filter; + return *this; + } + + void lhe::sort_events() + { + if (this->events.empty()) + { + throw std::runtime_error("lhe::sort_events() called with no events"); + } + if (this->sorter.size() == 0 && !this->event_hash) + { + this->set_sorter(); + } + this->process_order.clear(); + this->process_order.reserve(this->events.size()); + for (auto ev : this->events) + { + this->process_order.push_back(this->event_hash(*ev)); + } + if (this->process_order.size() != this->events.size()) + { + throw std::runtime_error("lhe::sort_events() failed: process_order size does not match 
events size"); + } + this->sorted_events.clear(); + for (size_t j = 0; j < this->sorter.size(); ++j) + { + this->sorted_events.push_back({}); + } + this->sorted_events.push_back({}); + for (size_t ind = 0; ind < this->events.size(); ++ind) + { + size_t sort_ind = this->process_order[ind]; + if (sort_ind == npos) + sort_ind = this->sorted_events.size() - 1; // If an event does not belong to any set in the sorter, returns npos; compensate by adding an additional "unsorted" vector at the end + this->sorted_events[sort_ind].push_back(this->events[ind]); + } + // If all events were successfully sorted, remove the empty "unsorted" vector + if (this->sorted_events.back().empty()) + { + this->sorted_events.pop_back(); // Remove empty last set + } + } + + void lhe::unsort_events() + { + if (this->sorted_events.empty()) + { + throw std::runtime_error("lhe::unsort_events() called with no sorted events"); + } + this->events.clear(); + this->events.reserve(this->process_order.size()); + std::vector unorder; + unorder.resize(this->sorted_events.size()); + for (size_t j = 0; j < this->process_order.size(); ++j) + { + auto curr_idx = this->process_order[j]; + if (curr_idx == npos) + curr_idx = unorder.size() - 1; + auto srt_idx = unorder[curr_idx]; + if (srt_idx >= this->sorted_events[curr_idx].size()) + throw std::runtime_error("lhe::unsort_events() failed: sorted event index out of bounds"); + this->events.push_back(this->sorted_events[curr_idx][srt_idx]); + unorder[curr_idx]++; + } + } + + void lhe::events_to_processes() + { + if (this->events.empty()) + { + throw std::runtime_error("lhe::events_to_processes() called with no events"); + } + this->sort_events(); // Overrides previous sorting (if one exists) + this->processes.clear(); + for (auto ev_set : this->sorted_events) + { + this->processes.push_back(std::make_shared(ev_set, this->filter_processes)); + } + } + + void lhe::processes_to_events() + { + if (this->processes.empty()) + { + throw 
std::runtime_error("lhe::processes_to_events() called with no processes"); + } + this->sorted_events.clear(); + for (auto &proc : this->processes) + { + proc->transpose(); + this->sorted_events.push_back(proc->events); + } + this->unsort_events(); + } + + void lhe::transpose() + { + if (this->events.empty() && this->processes.empty()) + { + throw std::runtime_error("lhe::transpose() called with no events or processes"); + } + if (!this->events.empty()) + { + // Transpose events to processes + this->events_to_processes(); + } + else if (!this->processes.empty()) + { + // Transpose processes to events + this->processes_to_events(); + } + else + { + warning("lhe::transpose() called with both events and processes filled, cannot deduce which transposition to run.\nCall with string argument \"events\" to transpose from events to processes or with string argument \"processes\" to transpose from processes to events.\nAlternatively, call the lhe::events_to_processes() or lhe::processes_to_events() functions directly."); + } + } + + void lhe::transpose(std::string dir) + { + if (dir == "events" || dir == "event") + { + this->events_to_processes(); + } + else if (dir == "processes" || dir == "process") + { + this->processes_to_events(); + } + else + { + warning("lhe::transpose() called with invalid direction: " + dir + "\nCall with string argument \"events\" to transpose from events to processes or with string argument \"processes\" to transpose from processes to events."); + } + } + + void lhe::append_weight_ids(bool include) + { + if (this->weight_ids->empty()) + { + return; + } + if (this->header.type() != typeid(REX::xmlNode) && this->header.type() != typeid(std::shared_ptr)) + { + return; + } + std::string wgtgr_str = "\nweight_ids->size(); i++) + { + wgtgr_str += "weight_ids->at(i) + "\'>"; + if (i < this->weight_context.size()) + { + wgtgr_str += this->weight_context[i]; + } + wgtgr_str += "\n"; + } + wgtgr_str += "\n"; + auto wgt_node = xmlNode::parse(wgtgr_str); + 
std::shared_ptr initrwgt; + if (this->header.type() == typeid(REX::xmlNode)) + { + if (std::any_cast(this->header).get_child("initrwgt")) + { + initrwgt = std::any_cast(this->header).get_child("initrwgt"); + } + else + { + initrwgt = xmlNode::parse("\n\n"); + } + } + else if (this->header.type() == typeid(std::shared_ptr)) + { + if (std::any_cast>(this->header)->get_child("initrwgt")) + { + initrwgt = std::any_cast>(this->header)->get_child("initrwgt"); + } + else + { + initrwgt = xmlNode::parse("\n\n"); + } + } + else + { + initrwgt = xmlNode::parse("\n\n"); + } + initrwgt->add_child(wgt_node, true); + if (this->header.type() == typeid(REX::xmlNode)) + { + this->header = std::any_cast(this->header).replace_child("initrwgt", initrwgt, true); + } + else if (this->header.type() == typeid(std::shared_ptr)) + { + std::any_cast>(this->header)->replace_child("initrwgt", initrwgt, true); + } + } + + void lhe::print_header(std::ostream &os) const + { + if (this->header.type() == typeid(std::string)) + { + os << std::any_cast(this->header); + } + else if (this->header.type() == typeid(std::shared_ptr)) + { + std::any_cast>(this->header)->write(os); + } + else if (this->header.type() == typeid(xmlNode)) + { + std::any_cast(this->header).write(os); + } + else if (this->header.has_value()) + { + warning("lhe::print_header() - Header is of unknown type, cannot print."); + } + } + + void lhe::print(std::ostream &os, bool include_ids) + { + this->append_weight_ids(include_ids); + os << "\n"; + this->print_header(os); + this->print_init(os); + for (auto event : this->events) + { + event->print(os, include_ids); + } + os << "\n"; + } + + // -------------------- xmlDoc -------------------- + + xmlDoc::xmlDoc(std::string xml) + : buf_(std::make_shared(std::move(xml))) {} + + const std::string &xmlDoc::str() const noexcept { return *buf_; } + std::string_view xmlDoc::view() const noexcept { return std::string_view(*buf_); } + std::shared_ptr xmlDoc::shared() const noexcept { return 
buf_; } + + // -------------------- Attr -------------------- + + std::string_view Attr::name() const noexcept { return name_new ? std::string_view(*name_new) : name_view; } + std::string_view Attr::value() const noexcept { return value_new ? std::string_view(*value_new) : value_view; } + bool Attr::modified() const noexcept { return name_new.has_value() || value_new.has_value(); } + + // -------------------- xmlNode (public) -------------------- + + xmlNode::xmlNode() = default; + xmlNode::~xmlNode() = default; + + // xmlNode::parse(std::string) + std::shared_ptr xmlNode::parse(std::string xml) + { + xmlDoc doc{std::move(xml)}; + auto shared = doc.shared(); + + // Keep the original first '<' offset + size_t first_start = find_first_element_start(*shared, 0); + size_t pos = first_start; // pass a copy; parse_element will advance this + + auto root = parse_element(shared, pos); + if (root) + { + root->prolog_start_ = 0; + root->prolog_end_ = first_start; // **not** the advanced pos + } + return root; + } + + // xmlNode::parse(const std::shared_ptr&) + std::shared_ptr xmlNode::parse(const std::shared_ptr &buf) + { + size_t first_start = find_first_element_start(*buf, 0); + size_t pos = first_start; + + auto root = parse_element(buf, pos); + if (root) + { + root->prolog_start_ = 0; + root->prolog_end_ = first_start; // **not** the advanced pos + } + return root; + } + + std::string_view xmlNode::name() const noexcept + { + return name_new_ ? 
std::string_view(*name_new_) : name_view_; + } + + std::string_view xmlNode::full() const noexcept + { + if (!doc_ || start_ == npos || end_ == npos) + return {}; + return std::string_view(doc_->data() + start_, end_ - start_); + } + + std::string_view xmlNode::content() const noexcept + { + if (!doc_ || content_start_ == npos || content_end_ == npos) + return {}; + return std::string_view(doc_->data() + content_start_, content_end_ - content_start_); + } + + const std::vector &xmlNode::attrs() const noexcept { return attrs_; } + std::vector> &xmlNode::children() { return children_; } + + bool xmlNode::modified(bool deep) const noexcept + { + if (modified_) + return true; + if (!deep) + return false; + for (auto &c : children_) + if (c->modified(true)) + return true; + return false; + } + + bool xmlNode::is_leaf() const noexcept { return children_.empty(); } + + void xmlNode::set_name(std::string new_name) + { + name_new_ = std::move(new_name); + modified_ = true; + } + + void xmlNode::set_content(std::string new_content) + { + content_writer_ = {}; // prefer explicit content over writer + content_new_ = std::move(new_content); + modified_ = true; + } + + void xmlNode::set_content_writer(std::function writer) + { + content_new_.reset(); + content_writer_ = std::move(writer); + modified_ = true; + } + + bool xmlNode::set_attr(std::string_view key, std::string new_value) + { + for (auto &a : attrs_) + { + if (a.name() == key) + { + a.value_new = std::move(new_value); + modified_ = true; + return true; + } + } + return false; + } + + void xmlNode::add_attr(std::string name, std::string value) + { + Attr a; + a.name_new = std::move(name); // brand-new, so these live in COW fields + a.value_new = std::move(value); + attrs_.push_back(std::move(a)); + modified_ = true; + } + + void xmlNode::add_child(std::shared_ptr child, bool add_nl) + { + auto *raw = child.get(); + children_.push_back(std::move(child)); + // Default placement: end of this node's content (before end 
tag) + inserts_.push_back(InsertHint{InsertHint::Where::AtEnd, nullptr, 0, raw}); + self_closing_ = false; + modified_ = true; + children_.back()->append_nl_start = add_nl; + append_nl_end = add_nl; + } + + bool xmlNode::insert_child_before(size_t anchor_in_doc_ordinal, std::shared_ptr child) noexcept + { + const xmlNode *anchor = nth_in_doc_child(anchor_in_doc_ordinal); + if (!anchor) + return false; + auto *raw = child.get(); + children_.push_back(std::move(child)); + inserts_.push_back(InsertHint{InsertHint::Where::Before, anchor, 0, raw}); + modified_ = true; + return true; + } + bool xmlNode::insert_child_after(size_t anchor_in_doc_ordinal, std::shared_ptr child) noexcept + { + const xmlNode *anchor = nth_in_doc_child(anchor_in_doc_ordinal); + if (!anchor) + return false; + auto *raw = child.get(); + children_.push_back(std::move(child)); + inserts_.push_back(InsertHint{InsertHint::Where::After, anchor, 0, raw}); + modified_ = true; + return true; + } + bool xmlNode::replace_child(size_t anchor_in_doc_ordinal, std::shared_ptr child) noexcept + { + const xmlNode *anchor = nth_in_doc_child(anchor_in_doc_ordinal); + if (!anchor) + return false; + // suppress the anchor + remove_child(anchor); + // insert new before where the old one was + auto *raw = child.get(); + children_.push_back(std::move(child)); + inserts_.push_back(InsertHint{InsertHint::Where::Before, anchor, 0, raw}); + modified_ = true; + return true; + } + bool xmlNode::replace_child(std::string_view anchor_name, std::shared_ptr child, bool add_nl) noexcept + { + size_t anchor_index = npos; + for (size_t i = 0; i < children_.size(); ++i) + { + if (children_[i] && children_[i]->name() == anchor_name) + { + anchor_index = i; + break; + } + } + if (anchor_index == children_.size() || anchor_index == npos) + this->add_child(child, add_nl); // if not found, just add to the end + const xmlNode *anchor = nth_in_doc_child(anchor_index); + if (!anchor) + return false; + // suppress the anchor + 
remove_child(anchor); + // insert new before where the old one was + auto *raw = child.get(); + children_.push_back(std::move(child)); + inserts_.push_back(InsertHint{InsertHint::Where::Before, anchor, 0, raw}); + modified_ = true; + children_.back()->append_nl_start = add_nl; + append_nl_end = add_nl; + return true; + } + bool xmlNode::insert_child_at_content_offset(size_t rel_offset, std::shared_ptr child) noexcept + { + if (content_start_ == npos || content_end_ == npos) + return false; + size_t content_len = content_end_ - content_start_; + if (rel_offset > content_len) + rel_offset = content_len; // clamp + size_t abs = content_start_ + rel_offset; + auto *raw = child.get(); + children_.push_back(std::move(child)); + inserts_.push_back(InsertHint{InsertHint::Where::AtAbs, nullptr, abs, raw}); + modified_ = true; + return true; + } + bool xmlNode::insert_child_at_start(std::shared_ptr child) noexcept + { + auto *raw = child.get(); + children_.push_back(std::move(child)); + inserts_.push_back(InsertHint{InsertHint::Where::AtStart, nullptr, 0, raw}); + modified_ = true; + return true; + } + bool xmlNode::insert_child_at_end(std::shared_ptr child) noexcept + { + auto *raw = child.get(); + children_.push_back(std::move(child)); + inserts_.push_back(InsertHint{InsertHint::Where::AtEnd, nullptr, 0, raw}); + modified_ = true; + return true; + } + + // -------------------- xmlNode (private helpers) -------------------- + + xmlNode::xmlNode(std::shared_ptr doc) : doc_(std::move(doc)) {} + + static inline bool is_space(char c) + { + return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'; + } + + size_t xmlNode::find_first_element_start(const std::string &s, size_t pos) + { + const size_t N = s.size(); + while (pos < N) + { + auto lt = s.find('<', pos); + if (lt == npos) + return N; + if (lt + 1 >= N) + return lt; + // Skip comments + if (lt + 3 < N && s.compare(lt, 4, "", lt + 4); + pos = (end == npos) ? 
N : end + 3; + continue; + } + // Skip processing instruction + if (s.compare(lt, 2, "", lt + 2); + pos = (end == npos) ? N : end + 2; + continue; + } + // Skip DOCTYPE or other declarations + if (s.compare(lt, 2, "', lt + 2); + pos = (end == npos) ? N : end + 1; + continue; + } + // CDATA should be part of content, not the first element — skip it here. + if (lt + 9 < N && s.compare(lt, 9, "", lt + 9); + pos = (end == npos) ? N : end + 3; + continue; + } + // Real element + return lt; + } + return N; + } + + bool xmlNode::skip_comment(const std::string &s, size_t &pos) + { + if (pos + 3 < s.size() && s.compare(pos, 4, "", pos + 4); + pos = (end == npos) ? s.size() : end + 3; + return true; + } + return false; + } + + bool xmlNode::skip_pi(const std::string &s, size_t &pos) + { + if (pos + 1 < s.size() && s.compare(pos, 2, "", pos + 2); + pos = (end == npos) ? s.size() : end + 2; + return true; + } + return false; + } + + bool xmlNode::skip_doctype(const std::string &s, size_t &pos) + { + if (pos + 1 < s.size() && s.compare(pos, 2, "', pos + 2); + pos = (end == npos) ? s.size() : end + 1; + return true; + } + return false; + } + + bool xmlNode::skip_cdata(const std::string &s, size_t &pos) + { + if (pos + 9 <= s.size() && s.compare(pos, 9, "", pos + 9); + pos = (end == npos) ? 
s.size() : end + 3; // advance past entire CDATA + return true; + } + return false; + } + + void xmlNode::parse_attributes(xmlNode &node, size_t &cur) + { + const auto &s = *node.doc_; + const size_t N = s.size(); + + auto skip_ws = [&](size_t &i) + { + while (i < N && is_space(s[i])) + ++i; + }; + + while (cur < N) + { + skip_ws(cur); + if (cur >= N) + break; + if (s[cur] == '>') + { + node.head_end_ = cur; + return; + } + if (s[cur] == '/' && cur + 1 < N && s[cur + 1] == '>') + { + node.head_end_ = cur + 1; + return; + } + + // name + size_t name_beg = cur; + while (cur < N && !is_space(s[cur]) && s[cur] != '=' && s[cur] != '>' && s[cur] != '/') + ++cur; + size_t name_end = cur; + + skip_ws(cur); + if (cur >= N || s[cur] != '=') + { + // Malformed attribute (no '='). Treat as boolean attribute with empty value. + Attr a; + a.name_view = std::string_view(s.data() + name_beg, name_end - name_beg); + a.value_view = std::string_view{}; + node.attrs_.push_back(std::move(a)); + continue; + } + ++cur; // skip '=' + skip_ws(cur); + if (cur >= N) + break; + + char quote = s[cur]; + if (quote != '"' && quote != '\'') + { + // Unquoted value (not strictly XML). Read until ws or tag end. 
+ size_t val_beg = cur; + while (cur < N && !is_space(s[cur]) && s[cur] != '>' && s[cur] != '/') + ++cur; + Attr a; + a.name_view = std::string_view(s.data() + name_beg, name_end - name_beg); + a.value_view = std::string_view(s.data() + val_beg, cur - val_beg); + node.attrs_.push_back(std::move(a)); + continue; + } + + ++cur; // after opening quote + size_t val_beg = cur; + size_t val_end = s.find(quote, cur); + if (val_end == npos) + { + val_end = N; + cur = N; + } + else + { + cur = val_end + 1; + } + + Attr a; + a.name_view = std::string_view(s.data() + name_beg, name_end - name_beg); + a.value_view = std::string_view(s.data() + val_beg, val_end - val_beg); + node.attrs_.push_back(std::move(a)); + } + } + + std::shared_ptr xmlNode::parse_element(const std::shared_ptr &doc, size_t &pos) + { + const std::string &s = *doc; + const size_t N = s.size(); + if (pos >= N || s[pos] != '<') + return nullptr; + + auto node = std::shared_ptr(new xmlNode(doc)); + node->start_ = pos; + + // Read name + size_t cur = pos + 1; + // skip any whitespace after '<' + while (cur < N && is_space(s[cur])) + ++cur; + size_t name_beg = cur; + while (cur < N && !is_space(s[cur]) && s[cur] != '>' && s[cur] != '/') + ++cur; + size_t name_end = cur; + + node->name_view_ = std::string_view(s.data() + name_beg, name_end - name_beg); + + // Attributes and tag end + parse_attributes(*node, cur); + + // Detect "/>" even if there is whitespace before '/', e.g. "". + // head_end_ points at '>' (or at the second char of "/>"), so look at the char before '>'. 
+ node->self_closing_ = + (node->head_end_ != npos && + node->head_end_ > node->start_ && + (*node->doc_)[node->head_end_ - 1] == '/'); + + if (node->self_closing_) + { + // Treat as empty content; keep offsets consistent + node->content_start_ = node->head_end_ + 1; + node->content_end_ = node->head_end_ + 1; + node->end_ = node->head_end_ + 1; + pos = node->end_; + return node; + } + + // Normal element with content and possibly children + node->content_start_ = (node->head_end_ == npos) ? N : node->head_end_ + 1; + + // Scan content interleaving children until matching end tag + size_t cursor = node->content_start_; + while (cursor < N) + { + size_t lt = s.find('<', cursor); + if (lt == npos) + { + // Malformed / missing end tag; consume to end + node->content_end_ = N; + node->end_ = N; + pos = N; + return node; + } + + // Check for closing tag of this node + if (lt + 1 < N && s[lt + 1] == '/') + { + node->content_end_ = lt; + // find end of closing tag + size_t gt = s.find('>', lt + 2); + node->end_ = (gt == npos) ? 
N : gt + 1; + pos = node->end_; + return node; + } + + // Skippable markup treated as content-only (not nodes here) + size_t temp = lt; + if (skip_comment(s, temp) || skip_pi(s, temp)) + { + cursor = temp; // comments and PIs aren't children; content continues + continue; + } + if (skip_doctype(s, temp)) + { + cursor = temp; + continue; + } + if (skip_cdata(s, temp)) + { + cursor = temp; + continue; + } // CDATA becomes raw content + + // Child node + size_t child_pos = lt; + auto child = parse_element(doc, child_pos); + if (!child) + { + // If parse failed, avoid infinite loop by moving cursor forward + cursor = lt + 1; + continue; + } + node->children_.push_back(std::move(child)); + cursor = child_pos; // parse_element advanced child_pos to end of child + } + + // Shouldn't reach here ordinarily + node->content_end_ = cursor; + node->end_ = cursor; + pos = cursor; + return node; + } + + // -------------------- Writer -------------------- + + bool xmlNode::modified_header() const noexcept + { + if (name_new_.has_value()) + return true; + for (auto const &a : attrs_) + if (a.modified()) + return true; + return false; + } + + bool xmlNode::modified_footer() const noexcept + { + return name_new_.has_value(); + } + + void xmlNode::write_start_tag(std::ostream &os) const + { + if (!modified_header()) + { + os.write(doc_->data() + start_, static_cast((head_end_ + 1) - start_)); + return; + } + + if (append_nl_start) + os.put('\n'); + + os.put('<'); + auto nm = name(); + os.write(nm.data(), static_cast(nm.size())); + for (auto const &a : attrs_) + { + os.put(' '); + auto an = a.name(); + os.write(an.data(), static_cast(an.size())); + os.write("=\"", 2); + auto av = a.value(); + os.write(av.data(), static_cast(av.size())); + os.put('"'); + } + if (self_closing_) + { + os.write("/>", 2); // canonicalize to "/>" on rebuild + } + else + { + os.put('>'); + } + } + + void xmlNode::write_end_tag(std::ostream &os) const + { + if (self_closing_) + return; // self-closing elements 
have no end tag + + if (!modified_footer()) + { + size_t tail_from = (content_end_ == npos) ? (head_end_ + 1) : content_end_; + if (append_nl_end) + os << "\n"; + os.write(doc_->data() + tail_from, static_cast(end_ - tail_from)); + return; + } + if (append_nl_end) + os << "\n"; + os.write("(nm.size())); + os.put('>'); + } + + void xmlNode::write(std::ostream &os) const { write_impl(os, /*as_child=*/false); } + + void xmlNode::write(std::string &out) const + { + struct StringAppendBuf : std::streambuf + { + std::string *s; + explicit StringAppendBuf(std::string *sp) : s(sp) {} + std::streamsize xsputn(const char *p, std::streamsize n) override + { + s->append(p, size_t(n)); + return n; + } + int overflow(int ch) override + { + if (ch != EOF) + s->push_back(char(ch)); + return ch; + } + } buf{&out}; + std::ostream os(&buf); + write_impl(os, /*as_child=*/false); + } + + void xmlNode::write_impl(std::ostream &os, bool as_child) const + { + // Prolog only for root + if (!as_child && prolog_end_ > prolog_start_ && prolog_end_ <= start_) + { + os.write(doc_->data() + prolog_start_, static_cast(prolog_end_ - prolog_start_)); + } + + // Fast path: unmodified leaf + if (!modified_ && children_.empty()) + { + os.write(doc_->data() + start_, static_cast(end_ - start_)); + return; + } + + // Start tag + if (start_ != npos) + write_start_tag(os); + + auto is_in_doc = [&](const std::shared_ptr &c) -> bool + { + return c->doc_.get() == this->doc_.get() && c->start_ != npos && c->end_ != npos; + }; + + // NEW: consumed flags for all inserts_ entries (prevents double-emits & infinite loops) + std::vector consumed(inserts_.size(), 0); + + // Emit all AtStart inserts (once) + for (size_t i = 0; i < inserts_.size(); ++i) + { + const auto &ins = inserts_[i]; + if (ins.where == InsertHint::Where::AtStart && !consumed[i]) + { + ins.node->write_impl(os, /*as_child=*/true); + consumed[i] = 1; + } + } + + // Content + if (content_start_ != npos && content_end_ != npos) + { + if 
(content_writer_) + { + content_writer_(os); + } + else if (content_new_) + { + os.write(content_new_->data(), static_cast(content_new_->size())); + } + else + { + size_t cursor = content_start_; + + // Helper: flush unconsumed AtAbs inserts in [cursor, limit) + auto flush_atabs_until = [&](size_t limit) + { + while (true) + { + size_t min_abs = npos; + for (size_t i = 0; i < inserts_.size(); ++i) + { + const auto &ins = inserts_[i]; + if (consumed[i]) + continue; + if (ins.where != InsertHint::Where::AtAbs) + continue; + if (ins.abs >= cursor && ins.abs < limit) + { + if (min_abs == npos || ins.abs < min_abs) + min_abs = ins.abs; + } + } + if (min_abs == npos) + break; + + // write gap up to min_abs + if (cursor < min_abs) + { + os.write(doc_->data() + cursor, static_cast(min_abs - cursor)); + cursor = min_abs; + } + // emit all AtAbs exactly at min_abs (that are not consumed) + for (size_t i = 0; i < inserts_.size(); ++i) + { + const auto &ins = inserts_[i]; + if (!consumed[i] && ins.where == InsertHint::Where::AtAbs && ins.abs == min_abs) + { + ins.node->write_impl(os, /*as_child=*/true); + consumed[i] = 1; // mark consumed to avoid re-finding it + } + } + // keep cursor at min_abs; next loop will look for strictly greater abs + } + }; + + // Iterate over in-doc children in stored order + for (auto const &c : children_) + { + if (!is_in_doc(c)) + continue; + + // First flush any AtAbs inserts before this child + flush_atabs_until(c->start_); + + // Write gap up to the child's start + if (cursor < c->start_) + { + os.write(doc_->data() + cursor, static_cast(c->start_ - cursor)); + cursor = c->start_; + } + + // Inserts anchored BEFORE this child (consume them) + for (size_t i = 0; i < inserts_.size(); ++i) + { + const auto &ins = inserts_[i]; + if (!consumed[i] && ins.where == InsertHint::Where::Before && ins.anchor == c.get()) + { + ins.node->write_impl(os, /*as_child=*/true); + consumed[i] = 1; + } + } + // Also flush AtAbs exactly at child's start (consume 
them) + for (size_t i = 0; i < inserts_.size(); ++i) + { + const auto &ins = inserts_[i]; + if (!consumed[i] && ins.where == InsertHint::Where::AtAbs && ins.abs == c->start_) + { + ins.node->write_impl(os, /*as_child=*/true); + consumed[i] = 1; + } + } + + // Child itself (unless suppressed) + if (!c->suppressed_) + c->write_impl(os, /*as_child=*/true); + cursor = c->end_; + + // Inserts anchored AFTER this child (consume them) + for (size_t i = 0; i < inserts_.size(); ++i) + { + const auto &ins = inserts_[i]; + if (!consumed[i] && ins.where == InsertHint::Where::After && ins.anchor == c.get()) + { + ins.node->write_impl(os, /*as_child=*/true); + consumed[i] = 1; + } + } + } + + // Flush remaining AtAbs up to content_end_ (consume) + flush_atabs_until(content_end_); + + // Tail gap + if (cursor < content_end_) + { + os.write(doc_->data() + cursor, static_cast(content_end_ - cursor)); + cursor = content_end_; + } + + // Finally, AtEnd inserts (consume) + for (size_t i = 0; i < inserts_.size(); ++i) + { + const auto &ins = inserts_[i]; + if (!consumed[i] && ins.where == InsertHint::Where::AtEnd) + { + ins.node->write_impl(os, /*as_child=*/true); + consumed[i] = 1; + } + } + } + } + // End tag + if (end_ != npos) + write_end_tag(os); + } + + // Deep copy of an xmlNode that creates a new source string + // Allows for independent manipulation of the copy, + // and allows for storing only certain nodes in memory + // rather than the entire document + std::shared_ptr xmlNode::deep_copy() const + { + std::string copy_content; + this->write(copy_content); // write the full node text into a string + auto copy = xmlNode::parse(copy_content); // copy full node text, ie a copy of just the content of this node + return copy; + } + + bool xmlNode::has_child(std::string_view name) const noexcept + { + return std::any_of(children_.begin(), children_.end(), + [&](const auto &child) + { return child->name() == name; }); + } + + std::shared_ptr xmlNode::get_child(std::string_view 
name) const noexcept + { + auto it = std::find_if(children_.begin(), children_.end(), + [&](const auto &child) + { return child->name() == name; }); + return (it != children_.end()) ? *it : nullptr; + } + + std::vector> xmlNode::get_children(std::string_view name) const noexcept + { + std::vector> result; + for (const auto &child : children_) + { + if (child->name() == name) + { + result.push_back(child); + } + } + return result; + } + + bool xmlNode::remove_child(size_t index) noexcept + { + if (index >= children_.size()) + return false; + children_[index]->suppressed_ = true; + modified_ = true; + return true; + } + + bool xmlNode::remove_child(const xmlNode *child) noexcept + { + for (auto &c : children_) + { + if (c.get() == child) + { + c->suppressed_ = true; + modified_ = true; + return true; + } + } + return false; + } + + bool xmlNode::remove_child(std::string_view name) noexcept + { + for (auto &c : children_) + { + if (c->name() == name) + { + c->suppressed_ = true; + modified_ = true; + return true; + } + } + return false; + } + + const xmlNode *xmlNode::nth_in_doc_child(size_t ordinal) const noexcept + { + size_t seen = 0; + for (auto const &c : children_) + { + bool in_doc = (c->doc_.get() == this->doc_.get() && c->start_ != npos && c->end_ != npos); + if (!in_doc) + continue; + if (seen == ordinal) + return c.get(); + ++seen; + } + return nullptr; + } + + std::string read_file(std::string_view path) + { + constexpr auto read_size = size_t(4096); + auto stream = std::ifstream(path.data()); + stream.exceptions(std::ios_base::badbit); + if (not stream) + { + throw std::ios_base::failure("file does not exist"); + } + auto out = std::string(); + auto buf = std::string(read_size, '\0'); + while (stream.read(&buf[0], read_size)) + { + out.append(buf, 0, stream.gcount()); + } + out.append(buf, 0, stream.gcount()); + return out; + } + + std::vector line_splitter(std::string_view content) + { + std::vector lines; + size_t start = 0; + size_t end = 0; + while 
((end = content.find('\n', start)) != npos) + { + lines.push_back(content.substr(start, end - start)); + start = end + 1; + } + lines.push_back(content.substr(start)); + return lines; + } + + std::vector blank_splitter(std::string_view content) + { + std::vector words; + size_t start = 0; + size_t end = 0; + while ((end = content.find_first_of(" \n\t", start)) != npos) + { + if (end > start) + { + words.push_back(content.substr(start, end - start)); + } + start = end + 1; + } + if (start < content.size()) + { + words.push_back(content.substr(start)); + } + return words; + } + + std::shared_ptr string_to_event(std::string_view content) + { + auto lines = line_splitter(content); + if (lines.empty()) + return nullptr; + if (lines.size() < 2) + return nullptr; + size_t ind = 0; + while (ind < lines.size() && lines[ind].empty()) + ++ind; + if (lines[0].find("(n_prt); + ev->set_proc_id(ctoi(head_line[1])).set_weight(ctod(head_line[2])).set_scale(ctod(head_line[3])).set_alphaEW(ctod(head_line[4])).set_alphaS(ctod(head_line[5])); + ++ind; + for (auto prt : *ev) + { + auto words = blank_splitter(lines[ind]); + prt.set_pdg(ctoi(words[0])).set_status(ctoi(words[1])).set_mother(ctoi(words[2]), ctoi(words[3])).set_icol(ctoi(words[4]), ctoi(words[5])).set_momentum(ctod(words[9]), ctod(words[6]), ctod(words[7]), ctod(words[8])).set_mass(ctod(words[10])).set_vtim(ctod(words[11])).set_spin(ctod(words[12])); + ++ind; + } + return ev; + } + + std::shared_ptr xml_to_event(std::shared_ptr node) + { + if (node->name() != "event") + return nullptr; + auto ev = string_to_event(node->content()); + if (node->n_children() > 0) + { + for (auto child : node->children()) + { + if (child->name() == "weights") + { + // Handle weights child node + auto wgt_strings = blank_splitter(child->content()); + for (const auto &wgt : wgt_strings) + { + ev->add_wgt(ctod(wgt)); + } + } + else if (child->name() == "rwgt") + { + // Handle reweight child node + for (auto rwgt_child : child->children()) + { + if 
(rwgt_child->name() == "wgt") + { + ev->add_wgt(ctod(rwgt_child->content())); + } + } + } + else if (child->name() == "scales") + { + for (auto attr : child->attrs()) + { + std::string aname(attr.name_view); + std::transform(aname.begin(), aname.end(), aname.begin(), ::tolower); + if (aname == "mur") + ev->set_muR(ctod(attr.value_view)); + else if (aname == "muf") + ev->set_muF(ctod(attr.value_view)); + else if (aname == "mups") + ev->set_muPS(ctod(attr.value_view)); + } + } + else + { + auto copy = child->deep_copy(); + ev->extra[std::string(copy->name())] = copy; + } + } + } + return ev; + } + + initNode string_to_init(std::string_view content) + { + auto lines = line_splitter(content); + size_t ind = 0; + while (ind < lines.size() && lines[ind].empty()) + ++ind; + if (lines[0].find(" node) + { + auto init = string_to_init(node->content()); + if (node->n_children() > 0) + { + for (auto child : node->children()) + { + auto copy = child->deep_copy(); + init.extra[std::string(copy->name())] = copy; + } + } + return init; + } + + std::any xml_to_any(std::shared_ptr node) + { + if (!node) + return std::nullopt; + auto new_node = node->deep_copy(); + return std::any(new_node); + } + + std::shared_ptr init_to_xml(const initNode &init) + { + std::string s = write_stream(&initNode::print_init, init); + return xmlNode::parse(s); + } + + std::shared_ptr event_to_xml(event &ev) + { + std::string s = write_stream(&event::print, ev, true); + return xmlNode::parse(s); + } + + std::optional> header_to_xml(const std::any &a) + { + if (!a.has_value()) + return std::nullopt; + + if (a.type() == typeid(std::shared_ptr)) + { + auto p = std::any_cast>(a); + if (p) + return p; + return std::nullopt; + } + if (a.type() == typeid(std::string)) + { + return xmlNode::parse(std::any_cast(a)); + } + // Unknown header payload: ignore + return std::nullopt; + } + + // Explicit XML instantiation definition (prebuilds the specialization) + template class lheReader, + std::shared_ptr, + 
std::shared_ptr>; + template class lheWriter; + + const xmlReader &xml_reader() + { + static const xmlReader b{&xml_to_init, &xml_to_event, &xml_to_any}; + return b; + } + + const xmlWriter &xml_writer() + { + static const xmlWriter t{&init_to_xml, &event_to_xml, &header_to_xml}; + return t; + } + + xmlRaw to_xml_raw(const lhe &doc) + { + return xml_writer().to_raw(doc); + } + + std::shared_ptr to_xml(xmlRaw &raw) + { + std::stringstream ss; + ss << "\n"; + + if (raw.header && *raw.header) // first: has optional, second: non-null shared_ptr + { + (*raw.header)->write(ss); + raw.header.reset(); // reset the optional itself + } + + raw.init->write(ss); + raw.init.reset(); + + for (auto &event : raw.events) + { + event->write(ss); + event.reset(); + } + + ss << "\n"; + return xmlNode::parse(ss.str()); + } + + std::shared_ptr to_xml(const lhe &doc) + { + auto raw = to_xml_raw(doc); + return to_xml(raw); + } + + lhe to_lhe(std::shared_ptr node) + { + auto builder = xml_reader(); + auto init = node->get_child("init"); + auto events = node->get_children("event"); + auto header = std::make_optional(node->get_child("header")); + return builder.read(init, events, header); + } + + lhe to_lhe(const std::string &xml) + { + return to_lhe(xmlNode::parse(xml)); + } + + lhe load_lhef(std::istream &in) + { + // reads file line by line to avoid issues with large files + // starts with reading the header (if it exists) and maps it to an xmlNode + // then reads the init node and makes an initNode object + // then reads each event node and makes an event object, creating a vector of shared ptrs to events + auto events = std::vector>(); + auto content = std::string(); + std::string buf; + while (buf.empty()) + { + std::getline(in, buf); + } + if (buf.find("LesHouchesEvents") == npos) + { + throw std::runtime_error("load_lhef: not a valid LHEF file"); + } + std::shared_ptr header = nullptr; + std::getline(in, buf); + if (buf.find("header") != npos) + { + if (buf.find("") != npos) + { + 
header = xmlNode::parse(buf); + } + else + { + content += buf + "\n"; + while (std::getline(in, buf)) + { + content += buf + "\n"; + if (buf.find("/header") != npos) + break; + } + header = xmlNode::parse(content); + content.clear(); + } + } + while (buf.find("init") == npos) + { + std::getline(in, buf); + } + if (buf.find("") != npos) + { + throw std::runtime_error("load_lhef: malformed init block"); + } + content += buf + "\n"; + while (std::getline(in, buf)) + { + content += buf + "\n"; + if (buf.find("/init") != npos) + { + break; + } + } + auto init = xml_to_init(xmlNode::parse(content)); + content.clear(); + while (std::getline(in, buf)) + { + if (buf.find(" load_xml(const std::string &filename) + { + auto xml_content = read_file(filename); + return xmlNode::parse(xml_content); + } + + std::string slha::upper(std::string s) + { + std::transform(s.begin(), s.end(), s.begin(), + [](unsigned char c) + { return static_cast(std::toupper(c)); }); + return s; + } + static std::string ltrim_(std::string s) + { + s.erase(s.begin(), std::find_if(s.begin(), s.end(), + [](unsigned char c) + { return !std::isspace(c); })); + return s; + } + static std::string rtrim_(std::string s) + { + s.erase(std::find_if(s.rbegin(), s.rend(), + [](unsigned char c) + { return !std::isspace(c); }) + .base(), + s.end()); + return s; + } + std::string slha::trim(const std::string &s) { return rtrim_(ltrim_(s)); } + + bool slha::starts_with_ci(const std::string &s, const char *prefix) + { + size_t n = std::char_traits::length(prefix); + if (s.size() < n) + return false; + for (size_t i = 0; i < n; ++i) + { + if (std::toupper(static_cast(s[i])) != + std::toupper(static_cast(prefix[i]))) + return false; + } + return true; + } + + std::string slha::indices_to_string(std::initializer_list indices) + { + std::ostringstream os; + os << '{'; + bool first = true; + for (int v : indices) + { + if (!first) + os << ','; + os << v; + first = false; + } + os << '}'; + return os.str(); + } + + void 
slha::read(std::istream &in) + { + blocks_.clear(); + decays_.clear(); + + slha::BlockData *current = nullptr; + + std::string raw; + while (std::getline(in, raw)) + { + std::string line = trim(raw); + if (line.empty()) + continue; + + // Strip trailing inline comment if any + size_t hash = line.find('#'); + if (hash != npos) + line = rtrim_(line.substr(0, hash)); + if (line.empty()) + continue; + + if (starts_with_ci(line, "BLOCK")) + { + std::istringstream ss(line); + std::string word, name; + ss >> word >> name; // "BLOCK NAME ..." + if (name.empty()) + { + current = nullptr; + continue; + } + name = upper(name); + current = &blocks_[name]; // creates if missing + continue; + } + + if (starts_with_ci(line, "DECAY")) + { + current = nullptr; + std::istringstream ss(line); + std::string w; + int pid = 0; + double width = 0.0; + ss >> w >> pid >> width; + if (ss) + decays_[pid] = width; // ignore extras + continue; + } + + if (current) + { + // Parse indices... value + std::istringstream ss(line); + std::vector toks; + for (std::string t; ss >> t;) + toks.push_back(t); + if (toks.size() < 2) + continue; // need at least 1 idx + value + + // value is last token + double val = 0.0; + try + { + val = std::stod(toks.back()); + } + catch (...) + { + continue; + } + + // preceding tokens must be ints + std::vector idx; + idx.reserve(toks.size() - 1); + bool ok = true; + for (size_t i = 0; i + 1 < toks.size(); ++i) + { + try + { + idx.push_back(std::stoi(toks[i])); + } + catch (...) 
+ { + ok = false; + break; + } + } + if (!ok || idx.empty()) + continue; + + current->entries[std::move(idx)] = val; + } + } + } + + void slha::write(std::ostream &out, + int value_precision, + bool scientific, + const std::string &indent) const + { + auto flags_backup = out.flags(); + auto prec_backup = out.precision(); + + if (scientific) + out << std::scientific; + out << std::setprecision(value_precision); + + // Blocks in lexical order + for (const auto &kv : blocks_) + { + out << "BLOCK " << kv.first << "\n"; + for (const auto &ev : kv.second.entries) + { + out << indent; + const auto &idx = ev.first; + for (size_t i = 0; i < idx.size(); ++i) + { + if (i) + out << ' '; + out << idx[i]; + } + out << ' ' << ev.second << "\n"; + } + out << "\n"; + } + + // Decays in pid order + for (const auto &d : decays_) + { + out << "DECAY " << d.first << ' ' << d.second << "\n"; + } + + out.flags(flags_backup); + out.precision(prec_backup); + } + + double slha::get(const std::string &block, + std::initializer_list indices, + double fallback) const + { + auto itb = blocks_.find(upper(block)); + if (itb == blocks_.end()) + return fallback; + std::vector key(indices.begin(), indices.end()); + auto itv = itb->second.entries.find(key); + return (itv == itb->second.entries.end()) ? fallback : itv->second; + } + + double slha::get(const std::string &block, + int i1, + double fallback) const + { + return get(block, {i1}, fallback); + } + + void slha::set(const std::string &block, + std::initializer_list indices, + double value) + { + auto &b = blocks_[upper(block)]; + std::vector key(indices.begin(), indices.end()); + b.entries[std::move(key)] = value; + } + + void slha::set(const std::string &block, + int i1, + double value) + { + set(block, {i1}, value); + } + + double slha::get_decay(int pid, double fallback) const + { + auto it = decays_.find(pid); + return (it == decays_.end()) ? 
fallback : it->second; + } + + void slha::set_decay(int pid, double width) + { + decays_[pid] = width; + } + + bool slha::has_block(const std::string &block) const + { + return blocks_.count(upper(block)) > 0; + } + + bool slha::has_entry(const std::string &block, std::initializer_list indices) const + { + auto itb = blocks_.find(upper(block)); + if (itb == blocks_.end()) + return false; + std::vector key(indices.begin(), indices.end()); + return itb->second.entries.count(key) > 0; + } + + slha to_slha(std::istream &in) + { + return slha::parse(in); + } + + slha to_slha(const std::string &slha_text) + { + std::istringstream iss(slha_text); + return to_slha(iss); + } + + slha to_slha(std::shared_ptr node) + { + if (!node || node->name() != "slha") + return slha(); + std::string content; + node->write(content); + return to_slha(content); + } + + slha to_slha(lhe &doc) + { + if (doc.header.type() == typeid(slha)) + return std::any_cast(doc.header); + if (doc.header.type() == typeid(std::shared_ptr)) + { + auto p = std::any_cast>(doc.header); + if (!p) + return slha(); + auto cont = p->get_child("slha"); + return to_slha(cont); + } + if (doc.header.type() == typeid(std::string)) + { + auto head = xmlNode::parse(std::any_cast(doc.header)); + auto p = head->get_child("slha"); + return to_slha(p); + } + return slha(); + } + + slha load_slha(const std::string &filename) + { + std::ifstream file(filename); + if (!file) + return slha(); + return slha::parse(file); + } + +} // namespace REX +#endif diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.h b/PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.h new file mode 100644 index 0000000000..0ae523d958 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/Rex.h @@ -0,0 +1,3426 @@ +/*** + * ______ + * | ___ \ + * | |_/ /_____ __ + * | // _ \ \/ / + * | |\ \ __/> < + * \_| \_\___/_/\_\ + * + ***/ +// +// *R*apid *e*vent e*x*traction Version 1.0.0 +// Rex is a C++ library for parsing and manipulating Les Houches Event-format (LHE) files. 
+// It is designed to fast and lightweight, in comparison to internal parsers in programs like MadGraph. +// Currently, Rex is in development and may not contain all features necessary for full LHE parsing. +// +// Copyright © 2023-2025 CERN, CERN Author Zenny Wettersten. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// All rights not expressly granted are reserved. +// + +#ifndef _REX_H_ +#define _REX_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Define pi +constexpr double pi = 3.141592653589793; + +// C++17 detection idiom for .size() method +template +struct has_size : std::false_type +{ +}; + +template +struct has_size().size())>> + : std::true_type +{ +}; + +template +constexpr bool has_size_v = has_size::value; + +// Detects if a type is std::optional +template +struct _is_optional : std::false_type +{ +}; +template +struct _is_optional> : std::true_type +{ +}; +template +constexpr bool _is_optional_v = _is_optional::value; + +namespace REX +{ + +#define UNUSED(x) (void)(x) // suppress unused variable warnings + + static const size_t npos = (size_t)-1; // generic "not found" value + + std::string to_upper(const std::string &str); // convert string to uppercase + + // generic warning function for printing warnings without throwing + void warning(std::string message); + + // free functions / callables that take (ostream&, args...), map to string + template >, int> = 0> + std::string write_stream(F &&f, Args &&...args) + { + std::ostringstream ss; + std::invoke(std::forward(f), ss, std::forward(args)...); + return ss.str(); + } + + // member functions that take (ostream&, args...) 
on an object, map to string + template >, int> = 0> + std::string write_stream(M mf, Obj &&obj, Args &&...args) + { + std::ostringstream ss; + std::invoke(mf, std::forward(obj), ss, std::forward(args)...); + return ss.str(); + } + + // string trimming function to remove leading whitespace + template + Str trim_left(Str str) + { + auto it = str.begin(); + while (it != str.end() && std::isspace(*it)) + ++it; + return str.substr(std::distance(str.begin(), it)); + } + + // Generic fcns for converting string-like objects to integers and doubles + // Trims leading whitespace and strips leading '+' if present + // Note that Str needs to have a .compare(), .data() and .size() method + template + int ctoi(Str str) + { + int ret; + str = trim_left(str); + if (str.compare(0, 1, "+") == 0) + { + str = str.substr(1); + } + auto result = std::from_chars(str.data(), str.data() + str.size(), ret); + if (result.ec != std::errc()) + { + throw std::invalid_argument("Invalid string-like object to convert to int"); + } + return ret; + } + extern template int ctoi(std::string str); + extern template int ctoi(std::string_view str); + + template + double ctod(Str str) + { + double ret; + str = trim_left(str); + if (str.compare(0, 1, "+") == 0) + { + str = str.substr(1); + } + auto result = std::from_chars(str.data(), str.data() + str.size(), ret); + if (result.ec != std::errc()) + { + throw std::invalid_argument("Invalid string-like object to convert to double"); + } + return ret; + } + extern template double ctod(std::string str); + extern template double ctod(std::string_view str); + + std::string read_file(std::string_view path); + std::vector line_splitter(std::string_view content); + std::vector blank_splitter(std::string_view content); + + // ZW: index sorting function, which returns vector + // of the indices of the original vector sorted + // by default in ascending order + // ie, for [5.0, 0.25, 2.0, 9.2] returns [1, 2, 0, 3] + template + std::shared_ptr> ind_sort(const 
std::vector &vector, std::function comp = std::less()) + { + auto sorted = std::make_shared>(vector.size()); + std::iota(sorted->begin(), sorted->end(), 0); + std::stable_sort(sorted->begin(), sorted->end(), [&](size_t i, size_t j) + { return comp(vector[i], vector[j]); }); + return sorted; + } + extern template std::shared_ptr> ind_sort(const std::vector &vector, std::function comp = std::less()); + extern template std::shared_ptr> ind_sort(const std::vector &vector, std::function comp = std::less()); + + // ZW: templated fcn for multiplying two vectors elementwise, + // assuming T has a multiplication operator* + template + std::shared_ptr> vec_elem_mult(const std::vector &vec1, const std::vector &vec2) + { + if (vec1.size() < vec2.size()) + { + return vec_elem_mult(vec2, vec1); + } + auto valVec = std::make_shared>(vec1.size()); + std::transform(vec1.begin(), vec1.end(), vec2.begin(), valVec->begin(), [](const T &v1, const T &v2) + { return v1 * v2; }); + return valVec; + } + extern template std::shared_ptr> vec_elem_mult(const std::vector &vec1, const std::vector &vec2); + extern template std::shared_ptr> vec_elem_mult(const std::vector &vec1, const std::vector &vec2); + extern template std::shared_ptr> vec_elem_mult(const std::vector &vec1, const std::vector &vec2); + + template + std::vector subvector(std::vector original, size_t begin, size_t end = npos) + { + if (end == npos) + end = original.size(); + if (begin > end || end > original.size()) + { + throw std::out_of_range("Invalid subvector range"); + } + return std::vector(original.begin() + begin, original.begin() + end); + } + extern template std::vector subvector(std::vector original, size_t begin, size_t end); + extern template std::vector subvector(std::vector original, size_t begin, size_t end); + extern template std::vector subvector(std::vector original, size_t begin, size_t end); + extern template std::vector subvector(std::vector original, size_t begin, size_t end); + extern template std::vector 
subvector(std::vector original, size_t begin, size_t end); + extern template std::vector subvector(std::vector original, size_t begin, size_t end); + extern template std::vector subvector(std::vector original, size_t begin, size_t end); + extern template std::vector subvector(std::vector original, size_t begin, size_t end); + + // ================================ + // arrN + // ----------------- + // A generic fixed-size array type, + // used in Rex for eg 4-vectors etc + // as STL containers do not ensure + // contiguity of containers of containers + // ================================ + + template + struct alignas(Align) arrN + { + static_assert(N > 0, "N must be > 0"); + T data[N]; + + constexpr arrN() = default; + + // Fill-ctor: arrN(T v) -> all elements set to v + explicit constexpr arrN(T v) + { + for (size_t i = 0; i < N; ++i) + data[i] = v; + } + + // Element-wise ctor from N values (enforced via SFINAE) + template ...>::value>> + constexpr arrN(Args &&...args) : data{T(std::forward(args))...} {} + + // From std::array + constexpr arrN(const std::array &a) + { + for (size_t i = 0; i < N; ++i) + data[i] = a[i]; + } + + // Conversion to std::array + constexpr operator std::array() const + { + std::array a{}; + for (size_t i = 0; i < N; ++i) + a[i] = data[i]; + return a; + } + + // element access + constexpr T &operator[](size_t i) + { + if (i >= N) + throw std::out_of_range("arrN[]"); + return data[i]; + } + constexpr const T &operator[](size_t i) const + { + if (i >= N) + throw std::out_of_range("arrN[]"); + return data[i]; + } + constexpr T &operator()(size_t i) { return (*this)[i]; } + constexpr const T &operator()(size_t i) const { return (*this)[i]; } + + // arithmetic (element-wise) + constexpr arrN operator+(const arrN &rhs) const + { + arrN r; + for (size_t i = 0; i < N; ++i) + r.data[i] = data[i] + rhs.data[i]; + return r; + } + constexpr arrN operator-(const arrN &rhs) const + { + arrN r; + for (size_t i = 0; i < N; ++i) + r.data[i] = data[i] - 
rhs.data[i]; + return r; + } + constexpr arrN &operator+=(const arrN &rhs) + { + for (size_t i = 0; i < N; ++i) + data[i] += rhs.data[i]; + return *this; + } + constexpr arrN &operator-=(const arrN &rhs) + { + for (size_t i = 0; i < N; ++i) + data[i] -= rhs.data[i]; + return *this; + } + constexpr arrN operator*(T scalar) const + { + arrN r; + for (size_t i = 0; i < N; ++i) + r.data[i] = data[i] * scalar; + return r; + } + constexpr arrN operator/(T scalar) const + { + arrN r; + for (size_t i = 0; i < N; ++i) + r.data[i] = data[i] / scalar; + return r; + } + constexpr arrN &operator*=(T scalar) + { + for (auto &v : data) + v *= scalar; + return *this; + } + constexpr arrN &operator/=(T scalar) + { + for (auto &v : data) + v /= scalar; + return *this; + } + + // Generic Euclidean dot + constexpr T dot_euclidean(const arrN &rhs) const + { + T s{}; + for (size_t i = 0; i < N; ++i) + s += data[i] * rhs.data[i]; + return s; + } + + // comparisons (lexicographic) + constexpr bool operator==(const arrN &rhs) const + { + for (size_t i = 0; i < N; ++i) + if (!(data[i] == rhs.data[i])) + return false; + return true; + } + constexpr bool operator!=(const arrN &rhs) const { return !(*this == rhs); } + // Comparison operators --- elementwise, not generally applicable + constexpr bool operator<(const arrN &rhs) const + { + for (size_t i = 0; i < N; ++i) + if (data[i] != rhs.data[i]) + return data[i] < rhs.data[i]; + return false; + } + constexpr bool operator>(const arrN &rhs) const { return rhs < *this; } + constexpr bool operator<=(const arrN &rhs) const { return (*this < rhs) || (*this == rhs); } + constexpr bool operator>=(const arrN &rhs) const { return (rhs < *this) || (*this == rhs); } + + // 4D-specific named accessors and Minkowski operations (enabled only when N==4) + template > + constexpr T &t() { return data[0]; } + template > + constexpr const T &t() const { return data[0]; } + + template > + constexpr T &x() { return data[1]; } + template > + constexpr const T 
&x() const { return data[1]; } + + template > + constexpr T &y() { return data[2]; } + template > + constexpr const T &y() const { return data[2]; } + + template > + constexpr T &z() { return data[3]; } + template > + constexpr const T &z() const { return data[3]; } + + // Minkowski dot: (+, -, -, -) + template > + constexpr T dot(const arrN &other) const + { + return data[0] * other.data[0] - (data[1] * other.data[1] + data[2] * other.data[2] + data[3] * other.data[3]); + } + + // Minkowski norm (sqrt(t^2 - x^2 - y^2 - z^2)) + template > + T norm() const + { + using std::sqrt; + return sqrt(data[0] * data[0] - (data[1] * data[1] + data[2] * data[2] + data[3] * data[3])); + } + }; + + // ================================ + // arrNRef (span-like proxy) + // -------------------------------- + // A lightweight proxy for a contiguous block of memory, + // used to treat vectors of arrN objects + // without needing to copy data when + // handling elements of vectors and vice versa + // ================================ + template + struct arrNRef + { + static_assert(N > 0, "N must be > 0"); + + T *p; // points to first of N + + // ctors + arrNRef() : p(nullptr) {} + explicit arrNRef(T *ptr) : p(ptr) {} + arrNRef(const arrNRef &) = default; + arrNRef(arrNRef &&) noexcept = default; + + // element access + T &operator[](size_t i) + { + assert(i < N); + return p[i]; + } + const T &operator[](size_t i) const + { + assert(i < N); + return p[i]; + } + + // implicit read conversion to value type + operator arrN, N>() const + { + arrN, N> v; + for (size_t i = 0; i < N; ++i) + v.data[i] = p[i]; + return v; + } + + // assign from value + arrNRef &operator=(const arrN, N> &v) + { + for (size_t i = 0; i < N; ++i) + p[i] = v.data[i]; + return *this; + } + + // assign from std::array + arrNRef &operator=(const std::array, N> &a) + { + for (size_t i = 0; i < N; ++i) + p[i] = a[i]; + return *this; + } + + // assign from another proxy (copy elements) + arrNRef &operator=(const arrNRef 
&rhs) + { + for (size_t i = 0; i < N; ++i) + p[i] = rhs[i]; + return *this; + } + + // assign from initializer_list + arrNRef &operator=(std::initializer_list> ilist) + { + assert(ilist.size() == N); + auto it = ilist.begin(); + for (size_t i = 0; i < N; ++i, ++it) + p[i] = *it; + return *this; + } + + // comparisons (value-wise) + bool operator==(const arrNRef &other) const + { + for (size_t i = 0; i < N; ++i) + if (!(p[i] == other.p[i])) + return false; + return true; + } + bool operator!=(const arrNRef &other) const { return !(*this == other); } + + bool operator==(const arrN, N> &other) const + { + for (size_t i = 0; i < N; ++i) + if (!(p[i] == other.data[i])) + return false; + return true; + } + bool operator!=(const arrN, N> &other) const { return !(*this == other); } + + bool operator<(const arrNRef &other) const + { + for (size_t i = 0; i < N; ++i) + if (p[i] != other.p[i]) + return p[i] < other.p[i]; + return false; + } + bool operator<(const arrN, N> &rhs) const + { + for (size_t i = 0; i < N; ++i) + if (p[i] != rhs.data[i]) + return p[i] < rhs.data[i]; + return false; + } + bool operator<=(const arrNRef &other) const { return (*this < other) || (*this == other); } + bool operator>(const arrNRef &other) const { return other < *this; } + bool operator>=(const arrNRef &other) const { return (other < *this) || (*this == other); } + + // 4D named accessors when N==4 (optional) + template > + T &t() { return p[0]; } + template > + T &x() { return p[1]; } + template > + T &y() { return p[2]; } + template > + T &z() { return p[3]; } + + template > + const T &t() const { return p[0]; } + template > + const T &x() const { return p[1]; } + template > + const T &y() const { return p[2]; } + template > + const T &z() const { return p[3]; } + }; + + // ================================ + // nStrideIter + // ------------------------------- + // Generic iterator for vectors of + // n-dimensional arrays arrN, + // ie just arrN* pointers + // with stride N + // 
================================ + template + class nStrideIter + { + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = arrN; + using difference_type = std::ptrdiff_t; + using reference = arrNRef; + using pointer = void; + + nStrideIter() : ptr_(nullptr) {} + explicit nStrideIter(T *p) : ptr_(p) {} + + reference operator*() const { return reference{ptr_}; } + reference operator[](difference_type k) const { return reference{ptr_ + (k * static_cast(N))}; } + + nStrideIter &operator++() + { + ptr_ += N; + return *this; + } + nStrideIter operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + nStrideIter &operator--() + { + ptr_ -= N; + return *this; + } + nStrideIter operator--(int) + { + auto tmp = *this; + --(*this); + return tmp; + } + + nStrideIter &operator+=(difference_type k) + { + ptr_ += k * static_cast(N); + return *this; + } + nStrideIter &operator-=(difference_type k) + { + ptr_ -= k * static_cast(N); + return *this; + } + + friend nStrideIter operator+(nStrideIter it, difference_type k) + { + it += k; + return it; + } + friend nStrideIter operator+(difference_type k, nStrideIter it) + { + it += k; + return it; + } + friend nStrideIter operator-(nStrideIter it, difference_type k) + { + it -= k; + return it; + } + friend difference_type operator-(nStrideIter a, nStrideIter b) { return (a.ptr_ - b.ptr_) / static_cast(N); } + + friend bool operator==(nStrideIter a, nStrideIter b) { return a.ptr_ == b.ptr_; } + friend bool operator!=(nStrideIter a, nStrideIter b) { return !(a == b); } + friend bool operator<(nStrideIter a, nStrideIter b) { return a.ptr_ < b.ptr_; } + friend bool operator>(nStrideIter a, nStrideIter b) { return b < a; } + friend bool operator<=(nStrideIter a, nStrideIter b) { return !(b < a); } + friend bool operator>=(nStrideIter a, nStrideIter b) { return !(a < b); } + + private: + T *ptr_; + }; + + // ================================ + // vecArrN (flat storage / N-chunk API) + // 
-------------------------------- + // storage container for arrN types + // Uses flat vector storage with + // arrNRef access to handle internal data + // like arrN objects + // Uses nStride as iterator such that + // storage traversal goes in steps of N + // ================================ + + template + class vecArrN + { + public: + using value_type = arrN; + using size_type = size_t; + using reference = arrNRef; + using const_reference = arrNRef; + using iterator = nStrideIter; + using const_iterator = nStrideIter; + + vecArrN() = default; + explicit vecArrN(size_type n_chunks) : flat_(n_chunks * N) {} + explicit vecArrN(std::vector flat) : flat_(std::move(flat)) { assert(flat_.size() % N == 0); } + template + explicit vecArrN(const vecArrN &other) + { + flat_ = other.flat_vector(); + size_t pad = other.size() * M % N; + flat_.resize(flat_.size() + pad, T{}); + } + + // Construct from sequence of arrN + vecArrN(std::initializer_list init) + { + flat_.reserve(init.size() * N); + for (const auto &q : init) + append_chunk(q); + } + + // Iterator-based constructor: accepts a range of arrNRef or arrN + template ::value && + std::is_convertible()), value_type>::value>> + vecArrN(InputIt first, InputIt last) + { + flat_.reserve(std::distance(first, last) * N); + for (; first != last; ++first) + { + value_type q = static_cast(*first); // assumes arrNRef → arrN copy + append_chunk(q); + } + } + + size_type size() const noexcept { return flat_.size() / N; } + bool empty() const noexcept { return flat_.empty(); } + + void reserve_chunks(size_type n) { flat_.reserve(n * N); } + size_type capacity_chunks() const noexcept { return flat_.capacity() / N; } + void resize_chunks(size_type n) { flat_.resize(n * N); } + + reference operator[](size_type i) { return reference{flat_.data() + i * N}; } + const_reference operator[](size_type i) const { return const_reference{flat_.data() + i * N}; } + + reference at(size_type i) + { + if (i >= size()) + throw 
std::out_of_range("vecArrN::at"); + return (*this)[i]; + } + const_reference at(size_type i) const + { + if (i >= size()) + throw std::out_of_range("vecArrN::at"); + return (*this)[i]; + } + + reference front() + { + assert(!empty()); + return reference{flat_.data()}; + } + const_reference front() const + { + assert(!empty()); + return const_reference{flat_.data()}; + } + reference back() + { + assert(!empty()); + return reference{flat_.data() + (size() - 1) * N}; + } + const_reference back() const + { + assert(!empty()); + return const_reference{flat_.data() + (size() - 1) * N}; + } + + iterator begin() { return iterator(flat_.data()); } + iterator end() { return iterator(flat_.data() + flat_.size()); } + const_iterator begin() const { return const_iterator(flat_.data()); } + const_iterator end() const { return const_iterator(flat_.data() + flat_.size()); } + const_iterator cbegin() const { return begin(); } + const_iterator cend() const { return end(); } + + void clear() noexcept { flat_.clear(); } + + void push_back(const value_type &q) { append_chunk(q); } + + template ...>::value>> + void push_back(Args &&...args) + { + value_type q{T(std::forward(args))...}; + append_chunk(q); + } + + template + void emplace_back(Args &&...args) + { + value_type q(std::forward(args)...); + append_chunk(q); + } + + iterator insert(iterator pos, const value_type &q) + { + auto off = pos - begin(); + auto it = flat_.insert(flat_.begin() + off * N, q.data, q.data + N); + return iterator(flat_.data() + (it - flat_.begin())); + } + + iterator erase(iterator pos) + { + auto off = pos - begin(); + auto first = flat_.begin() + off * N; + auto it = flat_.erase(first, first + N); + return iterator(flat_.data() + (it - flat_.begin())); + } + + void pop_back() + { + assert(!empty()); + flat_.resize(flat_.size() - N); + } + + void resize(size_type chunk_count) + { + flat_.resize(chunk_count * N); + } + + void resize(size_type chunk_count, const value_type &chunk_value) + { + auto 
old_chunks = size(); + flat_.resize(chunk_count * N); + for (size_type i = old_chunks; i < chunk_count; ++i) + { + (*this)[i] = chunk_value; + } + } + + void reserve(size_type chunk_count) { flat_.reserve(chunk_count * N); } + size_type capacity() const noexcept { return flat_.capacity() / N; } + void shrink_to_fit() { flat_.shrink_to_fit(); } + size_type max_size() const noexcept { return flat_.max_size() / N; } + + void assign(size_type chunk_count, const value_type &chunk_value) + { + flat_.clear(); + flat_.reserve(chunk_count * N); + for (size_type i = 0; i < chunk_count; ++i) + append_chunk(chunk_value); + } + + template + void assign(InputIt first, InputIt last) + { + flat_.clear(); + for (; first != last; ++first) + { + value_type q = static_cast(*first); + append_chunk(q); + } + } + + iterator insert(iterator pos, size_type count, const value_type &chunk_value) + { + auto off = pos - begin(); + std::vector tmp; + tmp.reserve(count * N); + for (size_type i = 0; i < count; ++i) + { + tmp.insert(tmp.end(), std::begin(chunk_value.data), std::end(chunk_value.data)); + } + auto it = flat_.insert(flat_.begin() + off * N, tmp.begin(), tmp.end()); + return iterator(flat_.data() + (it - flat_.begin())); + } + + template + iterator insert(iterator pos, InputIt first, InputIt last) + { + auto off = pos - begin(); + std::vector tmp; + for (; first != last; ++first) + { + value_type q = static_cast(*first); + tmp.insert(tmp.end(), std::begin(q.data), std::end(q.data)); + } + auto it = flat_.insert(flat_.begin() + off * N, tmp.begin(), tmp.end()); + return iterator(flat_.data() + (it - flat_.begin())); + } + + iterator erase(iterator first, iterator last) + { + auto off1 = first - begin(); + auto off2 = last - begin(); + auto it = flat_.erase(flat_.begin() + off1 * N, flat_.begin() + off2 * N); + return iterator(flat_.data() + (it - flat_.begin())); + } + + void swap(vecArrN &other) noexcept(noexcept(flat_.swap(other.flat_))) + { + flat_.swap(other.flat_); + } + + bool 
operator==(const vecArrN &other) const noexcept { return flat_ == other.flat_; } + bool operator!=(const vecArrN &other) const noexcept { return !(*this == other); } + + std::vector &flat_vector() noexcept { return flat_; } + const std::vector &flat_vector() const noexcept { return flat_; } + + // append from raw memory (unsafe unless count%N==0) + void append_flat(const T *data, size_type count) + { + assert(count % N == 0); + flat_.insert(flat_.end(), data, data + count); + } + + vecArrN subvec(size_type start_chunk, size_type end_chunk) const + { + if (start_chunk > end_chunk || end_chunk > size()) + { + throw std::out_of_range("vecArrN::subvec - invalid range"); + } + return vecArrN(begin() + start_chunk, begin() + end_chunk); + } + + template + vecArrN transpose() const + { + static_assert(M == this->size(), "Invalid transpose size"); + vecArrN result(N); + for (size_type i = 0; i < N; ++i) + { + for (size_type j = 0; j < M; ++j) + { + result[j][i] = (*this)[i][j]; + } + } + return result; + } + + private: + void append_chunk(const value_type &q) + { + flat_.insert(flat_.end(), std::begin(q.data), std::end(q.data)); + } + + std::vector flat_; + }; + + template + vecArrN subvector(vecArrN original, size_t begin, size_t end = npos) + { + if (end == npos) + end = original.size(); + if (begin > end || end > original.size()) + { + throw std::out_of_range("Invalid subvector range"); + } + return original.subvec(begin, end); + } + + // Defining specific instances for arr2, arr3, and arr4 + template + using arr2 = arrN; + extern template struct arrN; + extern template struct arrN; + extern template struct arrN; + extern template struct arrN; + extern template struct arrN; + template + using arr3 = arrN; + extern template struct arrN; + extern template struct arrN; + extern template struct arrN; + extern template struct arrN; + extern template struct arrN; + template + using arr4 = arrN; + extern template struct arrN; + extern template struct arrN; + extern template 
struct arrN; + extern template struct arrN; + extern template struct arrN; + + template + using arr2Ref = arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + template + using arr3Ref = arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + template + using arr4Ref = arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + extern template struct arrNRef; + + template + using vecArr2 = vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + template + using vecArr3 = vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + template + using vecArr4 = vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + extern template struct vecArrN; + + // Explicit parton struct for handling particle objects + // outside of the view-like type stored in event objects + struct parton + { + arr4 momenta_ = {0.0, 0.0, 0.0, 0.0}; // (E, px, py, pz) + double mass_ = 0.0; // mass + double vtim_ = 0.0; // lifetime + double spin_ = 0.0; // spin + long int pdg_ = 0; // PDG ID + short int status_ = 0; // status + arr2 mother_ = {0, 0}; // mother IDs + arr2 icol_ = {0, 0}; // color IDs + + parton() = default; + parton(const arr4 &mom, double m, double v, + double s, long int p, short int st, + arr2 &mth, arr2 &col) + : momenta_(mom), mass_(m), vtim_(v), spin_(s), pdg_(p), status_(st), mother_(mth), icol_(col) {} + 
parton(arr4 &&mom, double m, double v, + double s, long int p, short int st, + arr2 &&mth, arr2 &&col) + : momenta_(std::move(mom)), mass_(m), vtim_(v), spin_(s), pdg_(p), status_(st), mother_(std::move(mth)), icol_(std::move(col)) {} + parton(const parton &) = default; + parton(parton &&) noexcept = default; + parton &operator=(const parton &) = default; + parton &operator=(parton &&) noexcept = default; + + // Getters + arr4 &momenta(); + const arr4 &momenta() const; + arr4 &momentum(); + const arr4 &momentum() const; + arr4 &pUP(); + const arr4 &pUP() const; + arr4 &p(); + const arr4 &p() const; + arr4 &mom(); + const arr4 &mom() const; + double &E(); + const double &E() const; + double &t(); + const double &t() const; + double &px(); + const double &px() const; + double &x(); + const double &x() const; + double &py(); + const double &py() const; + double &y(); + const double &y() const; + double &pz(); + const double &pz() const; + double &z(); + const double &z() const; + double &m(); + const double &m() const; + double &mass(); + const double &mass() const; + double &vtim(); + const double &vtim() const; + double &vTimUP(); + const double &vTimUP() const; + double &spin(); + const double &spin() const; + double &spinUP(); + const double &spinUP() const; + long int &pdg(); + const long int &pdg() const; + long int &idUP(); + const long int &idUP() const; + long int &id(); + const long int &id() const; + short int &status(); + const short int &status() const; + short int &iStUP(); + const short int &iStUP() const; + short int &iSt(); + const short int &iSt() const; + arr2 &mother(); + const arr2 &mother() const; + arr2 &mothUP(); + const arr2 &mothUP() const; + arr2 &moth(); + const arr2 &moth() const; + arr2 &icol(); + const arr2 &icol() const; + arr2 &iColUP(); + const arr2 &iColUP() const; + arr2 &iCol(); + const arr2 &iCol() const; + + // Self-returning setters + parton &set_momenta(const arr4 &mom); + parton &set_pUP(const arr4 &mom); + parton &set_p(const 
arr4 &mom); + parton &set_mom(const arr4 &mom); + parton &set_E(double E); + parton &set_t(double pt); + parton &set_px(double px); + parton &set_x(double x); + parton &set_py(double py); + parton &set_y(double y); + parton &set_pz(double pz); + parton &set_z(double z); + parton &set_mass(double m); + parton &set_vtim(double v); + parton &set_vTimUP(double v); + parton &set_spin(double s); + parton &set_spinUP(double s); + parton &set_pdg(long int p); + parton &set_idUP(long int p); + parton &set_id(long int p); + parton &set_status(short int st); + parton &set_iStUP(short int st); + parton &set_iSt(short int st); + parton &set_mother(const arr2 &mth); + parton &set_mothUP(const arr2 &mth); + parton &set_moth(const arr2 &mth); + parton &set_mother(const short int m1, const short int m2); + parton &set_mothUP(const short int m1, const short int m2); + parton &set_moth(const short int m1, const short int m2); + parton &set_icol(const arr2 &col); + parton &set_iColUP(const arr2 &col); + parton &set_icol(const short int c1, const short int c2); + parton &set_iColUP(const short int c1, const short int c2); + parton &set_iCol(const arr2 &col); + parton &set_iCol(const short int c1, const short int c2); + + // Calculated observables + double pT() const; // transverse momentum + double pT2() const; // transverse momentum squared + double pL() const; // longitudinal momentum + double pL2() const; // longitudinal momentum squared + double eT() const; // transverse energy + double eT2() const; // transverse energy squared + double phi() const; // azimuthal angle + double theta() const; // polar angle + double eta() const; // pseudorapidity + double rap() const; // rapidity + double mT() const; // transverse mass + double mT2() const; // transverse mass squared + double m2() const; // mass squared + }; + + struct event + { + public: + // Default constructors + event() = default; + event(const event &) = default; + event(event &&) noexcept = default; + event &operator=(const 
event &) = default; + event &operator=(event &&) noexcept = default; + // Constructor with number of particles + explicit event(size_t n_particles); + explicit event(std::vector particles); + + size_t n_ = 0; // number of partons in the event + long int proc_id_ = 0; // process ID + double weight_ = 0.0; // event weight + double scale_ = 0.0; // event scale + double muF_ = 0.0; // factorization scale + double muR_ = 0.0; // renormalization scale + double muPS_ = 0.0; // parton shower scale + double alphaEW_ = 0.0; // electromagnetic coupling constant + double alphaS_ = 0.0; // strong coupling constant + vecArr4 momenta_ = {}; // momenta of particles (E, px, py, pz) + std::vector mass_ = {}, vtim_ = {}, spin_ = {}; // mass, virtual time, and spin + std::vector pdg_ = {}; // particle ids according to PDG standard + std::vector status_ = {}; // particle statuses in LHE standard (ie -1 incoming, +1 outgoing etc) + vecArr2 mother_ = {}, icol_ = {}; // mother and color indices + std::vector wgts_ = {}; // additional weights, if any; note that wgt ids are not stored at the event level, so custom writers need to handle this at the LHEF level + + // Self-returning setters + event &set_n(size_t n); + event &set_proc_id(long int id); + event &set_weight(double w); + event &set_scale(double s); + event &set_muF(double muF); + event &set_muR(double muR); + event &set_muPS(double muPS); + event &set_alphaEW(double aew); + event &set_alphaS(double as); + event &set_momenta(const vecArr4 &mom); + event &set_momenta(const std::vector> &mom); + event &set_mass(const std::vector &m); + event &set_vtim(const std::vector &v); + event &set_spin(const std::vector &s); + event &set_pdg(const std::vector &p); + event &set_status(const std::vector &st); + event &set_mother(const vecArr2 &m); + event &set_mother(const std::vector> &m); + event &set_icol(const vecArr2 &c); + event &set_icol(const std::vector> &c); + event &set_wgts(const std::vector &w); + event &add_wgt(double w, const 
std::string &id = ""); + + std::vector indices = {}; // indices of particles for ordered views without modifying underlying data + event &set_indices(); // Default indexing is sequential by storage order + event &set_indices(const event &e, bool fail_on_mismatch = false); // Set indices based on another event (fail on mismatch forces events to be equal, throws on miss) + event &set_indices(const std::vector &idxs); // Set indices explicitly + + // Access functions for alternative names of variables + size_t &nUP(); + const size_t &nUP() const; + size_t &n(); + const size_t &n() const; + long int &idPrUP(); + const long int &idPrUP() const; + long int &idPr(); + const long int &idPr() const; + double &xWgtUP(); + const double &xWgtUP() const; + double &xWgt(); + const double &xWgt() const; + double &weight(); + const double &weight() const; + double &scale(); + const double &scale() const; + double &scalUP(); + const double &scalUP() const; + double &muF(); + const double &muF() const; + double &muR(); + const double &muR() const; + double &muPS(); + const double &muPS() const; + double &aQEDUP(); + const double &aQEDUP() const; + double &alphaQED(); + const double &alphaQED() const; + double &aQED(); + const double &aQED() const; + double &alphaEW(); + const double &alphaEW() const; + double &aEW(); + const double &aEW() const; + double &aQCDUP(); + const double &aQCDUP() const; + double &alphaS(); + const double &alphaS() const; + double &aS(); + const double &aS() const; + double &aQCD(); + const double &aQCD() const; + vecArr4 &momenta(); + const vecArr4 &momenta() const; + vecArr4 &momentum(); + const vecArr4 &momentum() const; + vecArr4 &pUP(); + const vecArr4 &pUP() const; + vecArr4 &p(); + const vecArr4 &p() const; + std::vector &mUP(); + const std::vector &mUP() const; + std::vector &m(); + const std::vector &m() const; + std::vector &mass(); + const std::vector &mass() const; + std::vector &vtim(); + const std::vector &vtim() const; + std::vector 
&vTimUP(); + const std::vector &vTimUP() const; + std::vector &vTim(); + const std::vector &vTim() const; + std::vector &spin(); + const std::vector &spin() const; + std::vector &spinUP(); + const std::vector &spinUP() const; + std::vector &idUP(); + const std::vector &idUP() const; + std::vector &id(); + const std::vector &id() const; + std::vector &pdg(); + const std::vector &pdg() const; + std::vector &iStUP(); + const std::vector &iStUP() const; + std::vector &status(); + const std::vector &status() const; + std::vector &iSt(); + const std::vector &iSt() const; + vecArr2 &mother(); + const vecArr2 &mother() const; + vecArr2 &mothUP(); + const vecArr2 &mothUP() const; + vecArr2 &moth(); + const vecArr2 &moth() const; + vecArr2 &icol(); + const vecArr2 &icol() const; + vecArr2 &iColUP(); + const vecArr2 &iColUP() const; + vecArr2 &iCol(); + const vecArr2 &iCol() const; + std::vector &wgts(); + const std::vector &wgts() const; + size_t n_wgts() const; + + // IDs for various additional weights, shared between events (and the LHE struct) + std::shared_ptr> weight_ids = nullptr; + + // Print functions (LHEF XML format) + void print_head(std::ostream &os = std::cout) const; + void print_wgts_ids(std::ostream &os = std::cout) const; + void print_wgts_no_ids(std::ostream &os = std::cout) const; + void print_wgts(std::ostream &os = std::cout, bool include_ids = false) const; + void print(std::ostream &os = std::cout, bool include_ids = false) const; + void print_extra(std::ostream &os = std::cout) const; + void print_scales(std::ostream &os = std::cout) const; + + // Calculates gS based on alphaS + double gS(); + + // Scales (returns LHE scale if not set) + double get_muF() const; + double get_muR() const; + double get_muPS() const; + + // Particle struct, gives a view of the corresponding + // elements of the event-level storage vectors + struct particle + { + // Use arrNRef for reference-like access to arrN-like objects + // despite being sequences of elements of 
vecArrN objects + arr4Ref momentum_; + double &mass_; + double &vtim_; + double &spin_; + long int &pdg_; + short int &status_; + arr2Ref mother_; + arr2Ref icol_; + + particle(arr4Ref mom, + double &m, double &v, double &s, + long int &p, short int &st, + arr2Ref mth, + arr2Ref col) + : momentum_(mom), mass_(m), vtim_(v), spin_(s), + pdg_(p), status_(st), mother_(mth), icol_(col) {} + + particle(arr4Ref mom, + const double &m, const double &v, const double &s, + const long int &p, const short int &st, + arr2Ref mth, + arr2Ref col) + : momentum_(arr4Ref{const_cast(mom.p)}), + mass_(const_cast(m)), vtim_(const_cast(v)), spin_(const_cast(s)), + pdg_(const_cast(p)), status_(const_cast(st)), + mother_(arr2Ref{const_cast(mth.p)}), + icol_(arr2Ref{const_cast(col.p)}) {} + + arr4Ref pUP(); + arr4Ref pUP() const; + arr4Ref mom(); + arr4Ref mom() const; + arr4Ref p(); + arr4Ref p() const; + arr4Ref momentum(); + arr4Ref momentum() const; + + // Component aliases + double &E(); + const double &E() const; + double &t(); + const double &t() const; + double &x(); + double &px(); + const double &x() const; + const double &px() const; + double &y(); + double &py(); + const double &y() const; + const double &py() const; + double &z(); + double &pz(); + const double &z() const; + const double &pz() const; + double &mUP(); + const double &mUP() const; + double &m(); + const double &m() const; + double &mass(); + const double &mass() const; + double &vtim(); + const double &vtim() const; + double &vTimUP(); + const double &vTimUP() const; + double &vTim(); + const double &vTim() const; + double &spin(); + const double &spin() const; + double &spinUP(); + const double &spinUP() const; + long int &idUP(); + const long int &idUP() const; + long int &id(); + const long int &id() const; + long int &pdg(); + const long int &pdg() const; + short int &status(); + const short int &status() const; + short int &iSt(); + const short int &iSt() const; + short int &iStUP(); + const short int 
&iStUP() const; + arr2Ref mothUP(); + const arr2Ref mothUP() const; + arr2Ref moth(); + const arr2Ref moth() const; + arr2Ref mother(); + const arr2Ref mother() const; + arr2Ref icol(); + const arr2Ref icol() const; + arr2Ref iColUP(); + const arr2Ref iColUP() const; + arr2Ref iCol(); + const arr2Ref iCol() const; + + particle &set_pdg(long int p); + particle &set_id(long int p); + particle &set_idUP(long int p); + particle &set_status(short int s); + particle &set_iSt(short int s); + particle &set_iStUP(short int s); + particle &set_mother(short int i, short int j); + particle &set_mother(const arr2 &m); + particle &set_moth(short int i, short int j); + particle &set_moth(const arr2 &m); + particle &set_mothUP(short int i, short int j); + particle &set_mothUP(const arr2 &m); + particle &set_icol(short int i, short int c); + particle &set_icol(const arr2 &c); + particle &set_iColUP(short int i, short int c); + particle &set_iColUP(const arr2 &c); + particle &set_iCol(short int i, short int c); + particle &set_iCol(const arr2 &c); + particle &set_momentum(double e, double px, double py, double pz); + particle &set_momentum(const arr4 &mom); + particle &set_mom(double e, double px, double py, double pz); + particle &set_mom(const arr4 &mom); + particle &set_pUP(double e, double px, double py, double pz); + particle &set_pUP(const arr4 &mom); + particle &set_p(double e, double px, double py, double pz); + particle &set_p(const arr4 &mom); + particle &set_E(double e); + particle &set_t(double pt); + particle &set_x(double x); + particle &set_px(double px); + particle &set_y(double y); + particle &set_py(double py); + particle &set_z(double z); + particle &set_pz(double pz); + particle &set_mass(double m); + particle &set_mUP(double m); + particle &set_m(double m); + particle &set_vtim(double v); + particle &set_vTimUP(double v); + particle &set_vTim(double v); + particle &set_spin(double s); + particle &set_spinUP(double s); + + // Calculated observables + double pT() 
const; // transverse momentum + double pT2() const; // transverse momentum squared + double pL() const; // longitudinal momentum + double pL2() const; // longitudinal momentum squared + double eT() const; // transverse energy + double eT2() const; // transverse energy squared + double phi() const; // azimuthal angle + double theta() const; // polar angle + double eta() const; // pseudorapidity + double rap() const; // rapidity + double mT() const; // transverse mass + double mT2() const; // transverse mass squared + double m2() const; // mass squared + + // Print particle information (LHEF XML format) + void print(std::ostream &os = std::cout) const; + }; + + // Const version of the particle struct + struct const_particle + { + arr4Ref momentum_; + const double &mass_; + const double &vtim_; + const double &spin_; + const long int &pdg_; + const short int &status_; + arr2Ref mother_; + arr2Ref icol_; + + const_particle(arr4Ref mom, + const double &m, const double &v, const double &s, + const long int &p, const short int &st, + arr2Ref mth, + arr2Ref col) + : momentum_(mom), mass_(m), vtim_(v), spin_(s), + pdg_(p), status_(st), mother_(mth), icol_(col) {} + + arr4Ref pUP() const; + arr4Ref mom() const; + arr4Ref p() const; + arr4Ref momentum() const; + // Component aliases + const double &E() const; + const double &t() const; + const double &x() const; + const double &px() const; + const double &y() const; + const double &py() const; + const double &z() const; + const double &pz() const; + const double &mUP() const; + const double &m() const; + const double &mass() const; + const double &vtim() const; + const double &vTimUP() const; + const double &vTim() const; + const double &spin() const; + const double &spinUP() const; + const long int &idUP() const; + const long int &id() const; + const long int &pdg() const; + const short int &status() const; + const short int &iSt() const; + const short int &iStUP() const; + arr2Ref mothUP() const; + arr2Ref moth() const; + 
arr2Ref mother() const; + arr2Ref icol() const; + arr2Ref iColUP() const; + arr2Ref iCol() const; + + // Calculated observables + double pT() const; // transverse momentum + double pT2() const; // transverse momentum squared + double pL() const; // longitudinal momentum + double pL2() const; // longitudinal momentum squared + double eT() const; // transverse energy + double eT2() const; // transverse energy squared + double phi() const; // azimuthal angle + double theta() const; // polar angle + double eta() const; // pseudorapidity + double rap() const; // rapidity + double mT() const; // transverse mass + double mT2() const; // transverse mass squared + double m2() const; // mass squared + + void print(std::ostream &os = std::cout) const; + }; + + // Access particle views by indexing event like a vector + particle operator[](size_t i); + particle at(size_t i); + const_particle operator[](size_t i) const; + const_particle at(size_t i) const; + particle get_particle(size_t i); + const_particle get_particle(size_t i) const; + size_t size() const; + + struct particle_iterator + { + event *evt; + size_t index; + particle operator*(); + particle_iterator &operator++(); + bool operator!=(const particle_iterator &) const; + }; + + struct const_particle_iterator + { + const event *evt; + size_t index; + const_particle operator*() const; + const_particle_iterator &operator++(); + bool operator!=(const const_particle_iterator &) const; + }; + + particle_iterator begin(); + particle_iterator end(); + const_particle_iterator begin() const; + const_particle_iterator end() const; + + // Add particle to event, sets n = n+1 + event &add_particle(const parton &p); + event &add_particle(const particle &p); + event &add_particle(const const_particle &p); + + // Throws if any vector has a mismatched size with each other or nUP + void validate() const; + + // Generic additional tags/information + std::unordered_map extra; + + // Set generic data (overwrites if exists) + // Note: 
std::any is used to allow any type of value to be stored + // This is a simple key-value store for data, not part of the LHEF standard + template + void set(const std::string &name, T value) + { + extra[name] = std::any(std::move(value)); + } + + // Get extra data (throws if not found or wrong type) + template + T &get(const std::string &name) + { + auto it = extra.find(name); + if (it == extra.end()) + { + throw std::out_of_range("event::get: No parameter named '" + name + "'"); + } + if (it->second.type() != typeid(T)) + { + throw std::runtime_error("event::get: Parameter '" + name + "' is not of requested type"); + } + return std::any_cast(it->second); + } + + template + const T &get(const std::string &name) const + { + auto it = extra.find(name); + if (it == extra.end()) + { + throw std::out_of_range("event::get: No parameter named '" + name + "'"); + } + if (it->second.type() != typeid(T)) + { + throw std::bad_any_cast("event::get: Parameter '" + name + "' is not of requested type"); + } + return std::any_cast(it->second); + } + + bool has(const std::string &name) const + { + return extra.find(name) != extra.end(); + } + + // Internal view for event to iterate over particles as indices, + // ie allows treating the event as a collection of particles + // according to the indices vector ordering + struct event_view + { + event &evt; + const std::vector &indices; + + struct iterator + { + event &evt; + const std::vector &indices; + size_t i; + + iterator(event &evt_, const std::vector &indices_, size_t idx) + : evt(evt_), indices(indices_), i(idx) {} + + event::particle operator*() + { + return evt.get_particle(indices[i]); + } + + iterator &operator++() + { + ++i; + return *this; + } + + bool operator!=(const iterator &other) const + { + return i != other.i || &indices != &other.indices; + } + }; + + iterator begin() { return iterator{evt, indices, 0}; } + iterator end() { return iterator{evt, indices, indices.size()}; } + + size_t size() const { return 
indices.size(); } + + event::particle operator[](size_t i) + { + return evt.get_particle(indices[i]); + } + }; + + struct const_event_view + { + const event &evt; + const std::vector &indices; + + struct iterator + { + const event &evt; + const std::vector &indices; + size_t i; + + iterator(const event &evt_, const std::vector &indices_, size_t idx) + : evt(evt_), indices(indices_), i(idx) {} + + event::const_particle operator*() const + { + return evt.get_particle(indices[i]); + } + + iterator &operator++() + { + ++i; + return *this; + } + + bool operator!=(const iterator &other) const + { + return i != other.i || &indices != &other.indices; + } + }; + + iterator begin() const { return iterator{evt, indices, 0}; } + iterator end() const { return iterator{evt, indices, indices.size()}; } + + size_t size() const { return indices.size(); } + + event::const_particle operator[](size_t i) const + { + return evt.get_particle(indices[i]); + } + }; + + event_view view() + { + if (this->indices.empty()) + this->set_indices(); + return event_view{*this, indices}; + } + + const_event_view view() const + { + if (this->indices.empty()) + const_cast(this)->set_indices(); + return const_event_view{*this, indices}; + } + }; + + // Event comparator type + using event_equal_fn = std::function; + using cevent_equal_fn = std::function; + + // Global access + bool default_event_equal(const event &lhs, const event &rhs); + + // Custom comparator interface + bool operator==(const event &lhs, const event &rhs); + bool operator!=(const event &lhs, const event &rhs); + void set_event_comparator(cevent_equal_fn fn); + void reset_event_comparator(); + bool external_legs_comparator(event &a, event &b); + bool external_legs_const_comparator(const event &a, const event &b); + bool always_true(const event &a, const event &b); + + // Class to create custom event comparators + // This class allows users to define custom comparison logic for events + // by specifying which fields to compare and 
their tolerances. + // It supports comparison for all standard LHEF fields, and will automatically + // sort particle-level fields using exclusively the fields specified in the configuration. + // Additionally, the status_filter variable allows for defining which particle statuses to extract for comparison. + // For doubles, relative tolerances can be set independently for each field by the user. + // However, for integers, only exact equality is supported. + struct eventComparatorConfig + { + // Status filter: only compare particles with one of these statuses + std::set status_filter = {}; // empty = no filtering + + bool compare_momentum = false; + // per-momentum component toggles + bool compare_momentum_x = false; + bool compare_momentum_y = false; + bool compare_momentum_z = false; + bool compare_momentum_E = false; + + bool compare_mass = true; + bool compare_vtim = false; + bool compare_spin = false; + bool compare_pdg = true; + bool compare_status = true; + bool compare_mother = false; + bool compare_icol = false; + + bool compare_n = false; + bool compare_proc_id = false; + bool compare_weight = false; + bool compare_scale = false; + bool compare_alphaEW = false; + bool compare_alphaS = false; + + double mass_tol = 1e-8; + double vtim_tol = 1e-8; + double spin_tol = 1e-8; + double momentum_tol = 1e-8; + double weight_tol = 1e-8; + double scale_tol = 1e-8; + double alphaEW_tol = 1e-8; + double alphaS_tol = 1e-8; + + event_equal_fn make_comparator() const; + cevent_equal_fn make_const_comparator() const; + // Convenience functions to set individual parameters + // Event-level parameters + eventComparatorConfig &set_n(bool v) + { + compare_n = v; + return *this; + } + eventComparatorConfig &set_nUP(bool v) + { + compare_n = v; + return *this; + } + eventComparatorConfig &set_proc_id(bool v) + { + compare_proc_id = v; + return *this; + } + eventComparatorConfig &set_idPr(bool v) + { + compare_proc_id = v; + return *this; + } + eventComparatorConfig 
&set_idPrUP(bool v) + { + compare_proc_id = v; + return *this; + } + eventComparatorConfig &set_weight(bool v) + { + compare_weight = v; + return *this; + } + eventComparatorConfig &set_xWgt(bool v) + { + compare_weight = v; + return *this; + } + eventComparatorConfig &set_xWgtUP(bool v) + { + compare_weight = v; + return *this; + } + eventComparatorConfig &set_scale(bool v) + { + compare_scale = v; + return *this; + } + eventComparatorConfig &set_scalUP(bool v) + { + compare_scale = v; + return *this; + } + eventComparatorConfig &set_alphaEW(bool v) + { + compare_alphaEW = v; + return *this; + } + eventComparatorConfig &set_aQED(bool v) + { + compare_alphaEW = v; + return *this; + } + eventComparatorConfig &set_aQEDUP(bool v) + { + compare_alphaEW = v; + return *this; + } + eventComparatorConfig &set_alphaQED(bool v) + { + compare_alphaEW = v; + return *this; + } + eventComparatorConfig &set_aEW(bool v) + { + compare_alphaEW = v; + return *this; + } + eventComparatorConfig &set_alphaS(bool v) + { + compare_alphaS = v; + return *this; + } + eventComparatorConfig &set_aQCD(bool v) + { + compare_alphaS = v; + return *this; + } + eventComparatorConfig &set_aQCDUP(bool v) + { + compare_alphaS = v; + return *this; + } + eventComparatorConfig &set_aS(bool v) + { + compare_alphaS = v; + return *this; + } + // Particle-specific parameters + eventComparatorConfig &set_momentum(bool v) + { + compare_momentum = v; + compare_momentum_x = v; + compare_momentum_y = v; + compare_momentum_z = v; + compare_momentum_E = v; + return *this; + } + eventComparatorConfig &set_pUP(bool v) + { + compare_momentum = v; + compare_momentum_x = v; + compare_momentum_y = v; + compare_momentum_z = v; + compare_momentum_E = v; + return *this; + } + eventComparatorConfig &set_p(bool v) + { + compare_momentum = v; + compare_momentum_x = v; + compare_momentum_y = v; + compare_momentum_z = v; + compare_momentum_E = v; + return *this; + } + eventComparatorConfig &set_momenta(bool v) + { + 
compare_momentum = v; + compare_momentum_x = v; + compare_momentum_y = v; + compare_momentum_z = v; + compare_momentum_E = v; + return *this; + } + eventComparatorConfig &set_mom(bool v) + { + compare_momentum = v; + compare_momentum_x = v; + compare_momentum_y = v; + compare_momentum_z = v; + compare_momentum_E = v; + return *this; + } + eventComparatorConfig &set_momentum(bool e, bool x, bool y, bool z) + { + compare_momentum = e || x || y || z; + compare_momentum_E = e; + compare_momentum_x = x; + compare_momentum_y = y; + compare_momentum_z = z; + return *this; + } + eventComparatorConfig &set_momenta(bool e, bool x, bool y, bool z) + { + compare_momentum = e || x || y || z; + compare_momentum_E = e; + compare_momentum_x = x; + compare_momentum_y = y; + compare_momentum_z = z; + return *this; + } + eventComparatorConfig &set_pUP(bool e, bool x, bool y, bool z) + { + compare_momentum = e || x || y || z; + compare_momentum_E = e; + compare_momentum_x = x; + compare_momentum_y = y; + compare_momentum_z = z; + return *this; + } + eventComparatorConfig &set_p(bool e, bool x, bool y, bool z) + { + compare_momentum = e || x || y || z; + compare_momentum_E = e; + compare_momentum_x = x; + compare_momentum_y = y; + compare_momentum_z = z; + return *this; + } + eventComparatorConfig &set_mom(bool e, bool x, bool y, bool z) + { + compare_momentum = e || x || y || z; + compare_momentum_E = e; + compare_momentum_x = x; + compare_momentum_y = y; + compare_momentum_z = z; + return *this; + } + eventComparatorConfig &set_E(bool v) + { + compare_momentum_E = v; + compare_momentum = compare_momentum_x || compare_momentum_y || compare_momentum_z || compare_momentum_E; + return *this; + } + eventComparatorConfig &set_t(bool v) + { + compare_momentum_E = v; + compare_momentum = compare_momentum_x || compare_momentum_y || compare_momentum_z || compare_momentum_E; + return *this; + } + eventComparatorConfig &set_x(bool v) + { + compare_momentum_x = v; + compare_momentum = 
compare_momentum_x || compare_momentum_y || compare_momentum_z || compare_momentum_E; + return *this; + } + eventComparatorConfig &set_px(bool v) + { + compare_momentum_x = v; + compare_momentum = compare_momentum_x || compare_momentum_y || compare_momentum_z || compare_momentum_E; + return *this; + } + eventComparatorConfig &set_y(bool v) + { + compare_momentum_y = v; + compare_momentum = compare_momentum_x || compare_momentum_y || compare_momentum_z || compare_momentum_E; + return *this; + } + eventComparatorConfig &set_py(bool v) + { + compare_momentum_y = v; + compare_momentum = compare_momentum_x || compare_momentum_y || compare_momentum_z || compare_momentum_E; + return *this; + } + eventComparatorConfig &set_z(bool v) + { + compare_momentum_z = v; + compare_momentum = compare_momentum_x || compare_momentum_y || compare_momentum_z || compare_momentum_E; + return *this; + } + eventComparatorConfig &set_pz(bool v) + { + compare_momentum_z = v; + compare_momentum = compare_momentum_x || compare_momentum_y || compare_momentum_z || compare_momentum_E; + return *this; + } + eventComparatorConfig &set_mass(bool v) + { + compare_mass = v; + return *this; + } + eventComparatorConfig &set_m(bool v) + { + compare_mass = v; + return *this; + } + eventComparatorConfig &set_mUP(bool v) + { + compare_mass = v; + return *this; + } + eventComparatorConfig &set_vtim(bool v) + { + compare_vtim = v; + return *this; + } + eventComparatorConfig &set_vTim(bool v) + { + compare_vtim = v; + return *this; + } + eventComparatorConfig &set_vTimUP(bool v) + { + compare_vtim = v; + return *this; + } + eventComparatorConfig &set_spin(bool v) + { + compare_spin = v; + return *this; + } + eventComparatorConfig &set_spinUP(bool v) + { + compare_spin = v; + return *this; + } + eventComparatorConfig &set_pdg(bool v) + { + compare_pdg = v; + return *this; + } + eventComparatorConfig &set_id(bool v) + { + compare_pdg = v; + return *this; + } + eventComparatorConfig &set_idUP(bool v) + { + 
compare_pdg = v; + return *this; + } + eventComparatorConfig &set_status(bool v) + { + compare_status = v; + return *this; + } + eventComparatorConfig &set_iSt(bool v) + { + compare_status = v; + return *this; + } + eventComparatorConfig &set_iStUP(bool v) + { + compare_status = v; + return *this; + } + eventComparatorConfig &set_mother(bool v) + { + compare_mother = v; + return *this; + } + eventComparatorConfig &set_moth(bool v) + { + compare_mother = v; + return *this; + } + eventComparatorConfig &set_mothUP(bool v) + { + compare_mother = v; + return *this; + } + eventComparatorConfig &set_icol(bool v) + { + compare_icol = v; + return *this; + } + eventComparatorConfig &set_iCol(bool v) + { + compare_icol = v; + return *this; + } + eventComparatorConfig &set_iColUP(bool v) + { + compare_icol = v; + return *this; + } + + eventComparatorConfig &set_status_filter(std::vector v) + { + status_filter = std::set(v.begin(), v.end()); + return *this; + } + eventComparatorConfig &set_status_filter(std::set s) + { + status_filter = std::move(s); + return *this; + } + template + eventComparatorConfig &set_status_filter(Args... 
args) + { + status_filter = std::set{args...}; + return *this; + } + eventComparatorConfig &set_tolerance(double tol) + { + mass_tol = tol; + momentum_tol = tol; + vtim_tol = tol; + spin_tol = tol; + weight_tol = tol; + scale_tol = tol; + alphaEW_tol = tol; + alphaS_tol = tol; + return *this; + } + }; + + eventComparatorConfig compare_legs_only(); + eventComparatorConfig compare_final_state_only(); + eventComparatorConfig compare_physics_fields(); + + using event_bool_fn = std::function; // boolean function type + using cevent_bool_fn = std::function; // boolean function type + + // Struct to test whether an event belongs to a set of events + struct eventBelongs + { + public: + std::vector> events = {}; + event_equal_fn comparator = external_legs_comparator; + cevent_equal_fn const_comparator = external_legs_const_comparator; + // Default constructors + eventBelongs() = default; + eventBelongs(const eventBelongs &) = default; + eventBelongs(eventBelongs &&) noexcept = default; + eventBelongs &operator=(const eventBelongs &) = default; + eventBelongs &operator=(eventBelongs &&) noexcept = default; + // Constructor with one event + explicit eventBelongs(const event &e); + explicit eventBelongs(std::shared_ptr e); + // Constructor with multiple events + explicit eventBelongs(std::vector evts); + explicit eventBelongs(std::vector> evts); + // Constructors with comparator + explicit eventBelongs(const event &e, event_equal_fn comp); + explicit eventBelongs(std::shared_ptr e, event_equal_fn comp); + explicit eventBelongs(std::vector evts, event_equal_fn comp); + explicit eventBelongs(std::vector> evts, event_equal_fn comp); + explicit eventBelongs(const event &e, cevent_equal_fn comp); + explicit eventBelongs(std::shared_ptr e, cevent_equal_fn comp); + explicit eventBelongs(std::vector evts, cevent_equal_fn comp); + explicit eventBelongs(std::vector> evts, cevent_equal_fn comp); + explicit eventBelongs(const event &e, event_equal_fn comp, cevent_equal_fn ccomp); + 
explicit eventBelongs(std::shared_ptr e, event_equal_fn comp, cevent_equal_fn ccomp); + explicit eventBelongs(std::vector evts, event_equal_fn comp, cevent_equal_fn ccomp); + explicit eventBelongs(std::vector> evts, event_equal_fn comp, cevent_equal_fn ccomp); + // Add an event to the set + eventBelongs &add_event(const event &e); + eventBelongs &add_event(std::shared_ptr e); + eventBelongs &add_event(const std::vector &evts); + eventBelongs &add_event(std::vector> evts); + // Self-returning setting functions + eventBelongs &set_events(const event &e); + eventBelongs &set_events(std::shared_ptr e); + eventBelongs &set_events(const std::vector &evts); + eventBelongs &set_events(std::vector> evts); + eventBelongs &set_comparator(event_equal_fn comp); + eventBelongs &set_comparator(cevent_equal_fn comp); + eventBelongs &set_comparator(const eventComparatorConfig &cfg); + // Check if an event belongs to the set + bool belongs_mutable(event &e); + bool belongs_const(const event &e) const; + bool belongs(std::shared_ptr e); + bool belongs(event &e); + bool belongs(const event &e) const; + // Overload parenthesis operator for easy usage + bool operator()(event &e) + { + return belongs(e); + } + bool operator()(const event &e) const + { + return belongs(e); + } + event_bool_fn get_event_bool(); + cevent_bool_fn get_const_event_bool() const; + }; + + eventBelongs all_events_belong(); + + using event_hash_fn = std::function; // hash function type + using cevent_hash_fn = std::function; // hash function type + + // Struct to sort events by their belonging to sets of events + struct eventSorter + { + std::vector> event_sets = {}; + std::vector comparators = {}; + std::vector const_comparators = {}; + // Default constructors + eventSorter() = default; + eventSorter(const eventSorter &) = default; + eventSorter(eventSorter &&) noexcept = default; + eventSorter &operator=(const eventSorter &) = default; + eventSorter &operator=(eventSorter &&) noexcept = default; + // Constructor 
with one event set + explicit eventSorter(const eventBelongs &e_set); + explicit eventSorter(event_bool_fn comp); + explicit eventSorter(cevent_bool_fn comp); + explicit eventSorter(event_bool_fn comp, cevent_bool_fn ccomp); + // Constructor with multiple event sets + explicit eventSorter(std::vector e_sets); + explicit eventSorter(std::vector comps); + explicit eventSorter(std::vector comps, std::vector ccomps); + void extract_comparators(); + // Add an event set to the sorter + eventSorter &add_event_set(const eventBelongs &e_set); + eventSorter &add_event_set(const std::vector &e_sets); + eventSorter &add_bool(event_bool_fn comp); + eventSorter &add_const_bool(cevent_bool_fn comp); + eventSorter &add_bool(event_bool_fn comp, cevent_bool_fn ccomp); + eventSorter &add_bool(std::vector comps); + eventSorter &add_const_bool(std::vector ccomps); + eventSorter &add_bool(std::vector comps, std::vector ccomps); + // Self-returning setting functions + eventSorter &set_event_sets(const eventBelongs &e_set); + eventSorter &set_event_sets(const std::vector &e_sets); + eventSorter &set_bools(const event_bool_fn comp); + eventSorter &set_const_bools(const cevent_bool_fn comp); + eventSorter &set_bools(const event_bool_fn comp, const cevent_bool_fn ccomp); + eventSorter &set_bools(const std::vector comps); + eventSorter &set_const_bools(const std::vector comps); + eventSorter &set_bools(const std::vector comps, const std::vector ccomps); + // size() function just returns the number of event sets + size_t size() const; + // Function to find the position of an event in the sorter, returns npos if not found + size_t position(event &e); + size_t position(const event &e) const; + size_t position(std::shared_ptr e); + std::vector position(std::vector &evts); + std::vector position(const std::vector &evts) const; + std::vector position(std::vector> evts); + std::vector sort(std::vector &evts); + std::vector sort(const std::vector &evts) const; + std::vector sort(std::vector> evts); + 
event_hash_fn get_hash(); + cevent_hash_fn get_const_hash() const; + }; + + eventSorter make_sample_sorter(std::vector sample, event_equal_fn comp = external_legs_comparator); + eventSorter make_sample_sorter(std::vector> sample, event_equal_fn comp = external_legs_comparator); + + struct process + { + // Default constructors + process() = default; + process(const process &) = default; + process(process &&) noexcept = default; + process &operator=(const process &) = default; + process &operator=(process &&) noexcept = default; + explicit process(std::vector> evts, bool filter_partons = false); + explicit process(std::vector evts, bool filter_partons = false); + + process &add_event(const event &e); + process &add_event(std::shared_ptr e); + process &add_event_raw(const event &e); + process &add_event_raw(std::shared_ptr e); + process &add_event_filtered(const event &e); + process &add_event_filtered(std::shared_ptr e); + process &add_event(const std::vector &evts); + process &add_event(std::vector> evts); + + // LHEF data, vectorised + // Note that vecors of vectors are not contiguous in memory + std::vector n_ = {}; // number of partons per event + std::vector n_summed = {}; // number of (summed) partons per event (ie n_summed[i] = sum(n[0:i])) + std::vector proc_id_ = {}; // process IDs + std::vector weight_ = {}; // event weights + std::vector scale_ = {}; // event scales + std::vector muF_ = {}; // factorization scales + std::vector muR_ = {}; // renormalization scales + std::vector muPS_ = {}; // parton shower scales + std::vector alphaEW_ = {}; // electromagnetic coupling constants + std::vector alphaS_ = {}; // strong coupling constants + vecArr4 momenta_ = {}; + std::vector mass_ = {}, vtim_ = {}, spin_ = {}; + std::vector pdg_ = {}; + std::vector status_ = {}; + vecArr2 mother_ = {}, icol_ = {}; + std::vector> wgts_ = {}; // additional weights, if any; note that wgt ids are not stored at the event level, so custom + std::unordered_map> extra; + + bool 
filter = false; // whether to extract data using raw data or event_view + + // Accessors + std::vector &n() { return n_; } + const std::vector &n() const { return n_; } + std::vector &nUP() { return n_; } + const std::vector &nUP() const { return n_; } + std::vector &idPrUP() { return proc_id_; } + const std::vector &idPrUP() const { return proc_id_; } + std::vector &proc_id() { return proc_id_; } + const std::vector &proc_id() const { return proc_id_; } + std::vector &idPr() { return proc_id_; } + const std::vector &idPr() const { return proc_id_; } + std::vector &weight() { return weight_; } + const std::vector &weight() const { return weight_; } + std::vector &xWgtUP() { return weight_; } + const std::vector &xWgtUP() const { return weight_; } + std::vector &scale() { return scale_; } + const std::vector &scale() const { return scale_; } + std::vector &scalUP() { return scale_; } + const std::vector &scalUP() const { return scale_; } + std::vector &muF() { return muF_; } + const std::vector &muF() const { return muF_; } + std::vector &muR() { return muR_; } + const std::vector &muR() const { return muR_; } + std::vector &muPS() { return muPS_; } + const std::vector &muPS() const { return muPS_; } + std::vector &aQEDUP() { return alphaEW_; } + const std::vector &aQEDUP() const { return alphaEW_; } + std::vector &aQED() { return alphaEW_; } + const std::vector &aQED() const { return alphaEW_; } + std::vector &aEW() { return alphaEW_; } + const std::vector &aEW() const { return alphaEW_; } + std::vector &alphaEW() { return alphaEW_; } + const std::vector &alphaEW() const { return alphaEW_; } + std::vector &alphaS() { return alphaS_; } + const std::vector &alphaS() const { return alphaS_; } + std::vector &aQCDUP() { return alphaS_; } + const std::vector &aQCDUP() const { return alphaS_; } + std::vector &aQCD() { return alphaS_; } + const std::vector &aQCD() const { return alphaS_; } + std::vector &aS() { return alphaS_; } + const std::vector &aS() const { return 
alphaS_; } + + vecArr4 &pUP() { return momenta_; } + const vecArr4 &pUP() const { return momenta_; } + vecArr4 &mom() { return momenta_; } + const vecArr4 &mom() const { return momenta_; } + vecArr4 &p() { return momenta_; } + const vecArr4 &p() const { return momenta_; } + vecArr4 &momentum() { return momenta_; } + const vecArr4 &momentum() const { return momenta_; } + vecArr4 &momenta() { return momenta_; } + const vecArr4 &momenta() const { return momenta_; } + std::vector &mUP() { return mass_; } + const std::vector &mUP() const { return mass_; } + std::vector &m() { return mass_; } + const std::vector &m() const { return mass_; } + std::vector &mass() { return mass_; } + const std::vector &mass() const { return mass_; } + std::vector &vtim() { return vtim_; } + const std::vector &vtim() const { return vtim_; } + std::vector &vTimUP() { return vtim_; } + const std::vector &vTimUP() const { return vtim_; } + std::vector &vTim() { return vtim_; } + const std::vector &vTim() const { return vtim_; } + std::vector &spin() { return spin_; } + const std::vector &spin() const { return spin_; } + std::vector &spinUP() { return spin_; } + const std::vector &spinUP() const { return spin_; } + std::vector &idUP() { return pdg_; } + const std::vector &idUP() const { return pdg_; } + std::vector &pdg() { return pdg_; } + const std::vector &pdg() const { return pdg_; } + std::vector &id() { return pdg_; } + const std::vector &id() const { return pdg_; } + std::vector &status() { return status_; } + const std::vector &status() const { return status_; } + std::vector &iSt() { return status_; } + const std::vector &iSt() const { return status_; } + std::vector &iStUP() { return status_; } + const std::vector &iStUP() const { return status_; } + vecArr2 &mother() { return mother_; } + const vecArr2 &mother() const { return mother_; } + vecArr2 &moth() { return mother_; } + const vecArr2 &moth() const { return mother_; } + vecArr2 &mothUP() { return mother_; } + const vecArr2 
&mothUP() const { return mother_; } + vecArr2 &iColUP() { return icol_; } + const vecArr2 &iColUP() const { return icol_; } + vecArr2 &iCol() { return icol_; } + const vecArr2 &iCol() const { return icol_; } + vecArr2 &icol() { return icol_; } + const vecArr2 &icol() const { return icol_; } + + std::vector> &wgtUP() { return wgts_; } + const std::vector> &wgtUP() const { return wgts_; } + std::vector> &wgts() { return wgts_; } + const std::vector> &wgts() const { return wgts_; } + + std::vector &get_muF() + { + // if muF empty, set it equal to scale before returning + if (muF_.empty()) + muF_ = scale_; + return muF_; + } + + std::vector &get_muR() + { + // if muR empty, set it equal to scale before returning + if (muR_.empty()) + muR_ = scale_; + return muR_; + } + + std::vector &get_muPS() + { + // if muPS empty, set it equal to scale before returning + if (muPS_.empty()) + muPS_ = scale_; + return muPS_; + } + + // Specific momenta components --- not references! + // When accessing specific momentum components like this, + // the data is copied and then needs to be overwritten + // in the original process object using the set_* functions + std::vector E(); + std::vector t(); + std::vector x(); + std::vector px(); + std::vector y(); + std::vector py(); + std::vector z(); + std::vector pz(); + process &set_E(const std::vector &E); + process &set_t(const std::vector &pt); + process &set_x(const std::vector &x); + process &set_px(const std::vector &px); + process &set_y(const std::vector &y); + process &set_py(const std::vector &py); + process &set_z(const std::vector &z); + process &set_pz(const std::vector &pz); + + std::vector gS(); + process &set_gS(const std::vector &gS); + + // vector of events (can be overwritten!) 
+ // For LHE objects, the events are shared between + // the process objects and the owning LHE object + std::vector> events = {}; + + // Self-returning setting functions + process &set_n(const std::vector &n); + process &set_n_summed(const std::vector &n_summed); + process &set_proc_id(const std::vector &proc_id); + process &set_weight(const std::vector &weight); + process &set_scale(const std::vector &scale); + process &set_muF(const std::vector &muF); + process &set_muR(const std::vector &muR); + process &set_muPS(const std::vector &muPS); + process &set_alphaEW(const std::vector &alphaEW); + process &set_alphaS(const std::vector &alphaS); + process &set_momenta(const vecArr4 &momenta); + process &set_mass(const std::vector &mass); + process &set_vtim(const std::vector &vtim); + process &set_spin(const std::vector &spin); + process &set_pdg(const std::vector &pdg); + process &set_status(const std::vector &status); + process &set_mother(const vecArr2 &mother); + process &set_icol(const vecArr2 &icol); + process &set_wgts(const std::vector> &wgts); + process &append_wgts(const std::vector &wgts); + process &add_extra(const std::string &name, const std::any &value); + process &add_extra(const std::unordered_map &values); + process &add_extra(const std::string &name, const std::vector &values); + process &add_extra(const std::unordered_map> &values); + process &set_extra(const std::unordered_map> &values); + + // Flag whether to pull event data according to internal + // storage or to event_view indexing + process &set_filter(bool v); + + // Throws if any vector has a mismatched size with each other or nUP + void validate() const; + + // Functions for total transposition to the events vector + void make_event(size_t idx); + void transpose(); + + // Partial transposition functions, moving a single parameter into each event in the events vector + process &transpose_n(); + process &transpose_nUP(); + process &transpose_proc_id(); + process &transpose_idPrUP(); + 
process &transpose_idPr(); + process &transpose_weight(); + process &transpose_xWgtUP(); + process &transpose_xWgt(); + process &transpose_scale(); + process &transpose_scalUP(); + process &transpose_muF(); + process &transpose_muR(); + process &transpose_muPS(); + process &transpose_alphaEW(); + process &transpose_aQEDUP(); + process &transpose_aQED(); + process &transpose_alphaS(); + process &transpose_aQCDUP(); + process &transpose_aQCD(); + process &transpose_momenta(); + process &transpose_pUP(); + process &transpose_mom(); + process &transpose_p(); + process &transpose_mass(); + process &transpose_mUP(); + process &transpose_m(); + process &transpose_vtim(); + process &transpose_vTimUP(); + process &transpose_spin(); + process &transpose_spinUP(); + process &transpose_pdg(); + process &transpose_idUP(); + process &transpose_id(); + process &transpose_status(); + process &transpose_iStUP(); + process &transpose_iSt(); + process &transpose_mother(); + process &transpose_mothUP(); + process &transpose_icol(); + process &transpose_iColUP(); + process &transpose_wgts(); + process &transpose_extra(); + + // Specific momentum component transpositions + process &transpose_E(); + process &transpose_t(); + process &transpose_x(); + process &transpose_px(); + process &transpose_y(); + process &transpose_py(); + process &transpose_z(); + process &transpose_pz(); + }; + + // Init class + // Primarily used to make it possible to split init node information + // from the rest of the LHE information without needing to + // complicate the logic in readers and writers + struct initNode + { + // Default constructors + initNode() = default; + initNode(const initNode &) = default; + initNode(initNode &&) noexcept = default; + initNode &operator=(const initNode &) = default; + initNode &operator=(initNode &&) noexcept = default; + // Custom constructors + initNode(short unsigned int nproc); + initNode(size_t nproc); + + arr2 idBm_ = {0, 0}; // beam IDs + arr2 eBm_ = {0.0, 0.0}; // 
beam energies + arr2 pdfG_ = {0, 0}; // PDF group IDs + arr2 pdfS_ = {0, 0}; // PDF set IDs + short int idWgt_ = 0; // weight ID + short unsigned int nProc_ = 0; // number of processes + + std::vector xSec_ = {}; // cross sections + std::vector xSecErr_ = {}; // cross section errors + std::vector xMax_ = {}; // maximum weights + std::vector lProc_ = {}; // process IDs + + // Access functions for various alternative names of the variables above, all passed as references + arr2 &idBmUP() { return idBm_; } + const arr2 &idBmUP() const { return idBm_; } + arr2 &idBm() { return idBm_; } + const arr2 &idBm() const { return idBm_; } + arr2 &eBmUP() { return eBm_; } + const arr2 &eBmUP() const { return eBm_; } + arr2 &eBm() { return eBm_; } + const arr2 &eBm() const { return eBm_; } + arr2 &pdfGUP() { return pdfG_; } + const arr2 &pdfGUP() const { return pdfG_; } + arr2 &pdfG() { return pdfG_; } + const arr2 &pdfG() const { return pdfG_; } + arr2 &pdfSUP() { return pdfS_; } + const arr2 &pdfSUP() const { return pdfS_; } + arr2 &pdfS() { return pdfS_; } + const arr2 &pdfS() const { return pdfS_; } + short int &idWgtUP() { return idWgt_; } + const short int &idWgtUP() const { return idWgt_; } + short int &idWgt() { return idWgt_; } + const short int &idWgt() const { return idWgt_; } + short unsigned int &nProcUP() { return nProc_; } + const short unsigned int &nProcUP() const { return nProc_; } + short unsigned int &nProc() { return nProc_; } + const short unsigned int &nProc() const { return nProc_; } + std::vector &xSecUP() { return xSec_; } + const std::vector &xSecUP() const { return xSec_; } + std::vector &xSec() { return xSec_; } + const std::vector &xSec() const { return xSec_; } + std::vector &xSecErrUP() { return xSecErr_; } + const std::vector &xSecErrUP() const { return xSecErr_; } + std::vector &xSecErr() { return xSecErr_; } + const std::vector &xSecErr() const { return xSecErr_; } + std::vector &xMaxUP() { return xMax_; } + const std::vector &xMaxUP() const { 
return xMax_; } + std::vector &xMax() { return xMax_; } + const std::vector &xMax() const { return xMax_; } + std::vector &lProcUP() { return lProc_; } + const std::vector &lProcUP() const { return lProc_; } + std::vector &lProc() { return lProc_; } + const std::vector &lProc() const { return lProc_; } + + initNode &set_idBm(const arr2 &ids); + initNode &set_idBm(long int id1, long int id2); + initNode &set_eBm(const arr2 &energies); + initNode &set_eBm(double e1, double e2); + initNode &set_pdfG(const arr2 &ids); + initNode &set_pdfG(short int id1, short int id2); + initNode &set_pdfS(const arr2 &ids); + initNode &set_pdfS(long int id1, long int id2); + initNode &set_idWgt(short int id); + initNode &set_nProc(short unsigned int n); + initNode &set_xSec(const std::vector &xsec); + initNode &set_xSecErr(const std::vector &xsec_err); + initNode &set_xMax(const std::vector &xmax); + initNode &set_lProc(const std::vector &lproc); + initNode &add_xSec(double xsec); + initNode &add_xSecErr(double xsec_err); + initNode &add_xMax(double xmax); + initNode &add_lProc(long int lproc); + + void validate_init() const; + + void print_head(std::ostream &os = std::cout) const; + void print_body(std::ostream &os = std::cout) const; + void print_extra(std::ostream &os = std::cout) const; + void print_init(std::ostream &os = std::cout) const; + + // Generic additional tags/information + std::unordered_map extra; + + // Set generic data (overwrites if exists) + // Note: std::any is used to allow any type of value to be stored + // This is a simple key-value store for data, not part of the LHEF standard + template + void set(const std::string &name, T value) + { + extra[name] = std::any(std::move(value)); + } + + // Get extra data (throws if not found or wrong type) + template + T &get(const std::string &name) + { + auto it = extra.find(name); + if (it == extra.end()) + { + throw std::out_of_range("No parameter named '" + name + "'"); + } + return std::any_cast(it->second); + } + + 
template + const T &get(const std::string &name) const + { + auto it = extra.find(name); + if (it == extra.end()) + { + throw std::out_of_range("No parameter named '" + name + "'"); + } + return std::any_cast(it->second); + } + + bool has(const std::string &name) const + { + return extra.find(name) != extra.end(); + } + }; + + // Data-driven LHE struct + // Contains both the object oriented event representation + // and the data-oriented SoA process representation, + // plus the additional information stored in the init node + // Does not have a defined header object aside from a generic std::any which + // needs to be unpacked manually if used, although weight ids as in the initrwgt node + // can be stored in a vector of strings (without any weight group splittings) + // and can be passed on to the event nodes for writing weights with ids in the format + struct lhe : public initNode + { + // Default constructors + lhe() = default; + lhe(const lhe &) = default; + lhe(lhe &&) noexcept = default; + lhe &operator=(const lhe &) = default; + lhe &operator=(lhe &&) noexcept = default; + // Custom constructors + explicit lhe(const initNode &i) : initNode(i) {}; + explicit lhe(std::vector> evts); + explicit lhe(std::vector evts); + explicit lhe(const initNode &i, std::vector> evts); + explicit lhe(const initNode &i, std::vector evts); + + std::any header; + + bool filter_processes = true; + + eventSorter sorter; + event_hash_fn event_hash = nullptr; // hash function for events (ie sorter) + + std::vector process_order = {}; // mapping from events to processes + + std::vector> events = {}; // vector of events + std::vector>> sorted_events; // vector of vectors of events, sorted according to the processes scheme + std::vector> processes = {}; // vector of processes + + std::shared_ptr> weight_ids = std::make_shared>(); // weight ids for the block, if any + + std::vector weight_context = {}; // context strings for each weight, if any + + // Self-returning setting functions + 
lhe &set_events(std::vector> evts); + lhe &set_processes(std::vector> procs); + lhe &set_header(std::any hdr); + lhe &add_event(std::shared_ptr evt); + lhe &add_event(const event &evt); + lhe &set_sorter(); + lhe &set_sorter(event_equal_fn comp); + lhe &set_sorter(cevent_equal_fn comp); + lhe &set_sorter(const eventSorter &s); + void extract_hash(); + lhe &set_hash(event_hash_fn hash); + lhe &set_filter(bool v); + lhe &set_weight_ids(const std::vector &ids); + lhe &set_weight_ids(std::vector &&ids); + lhe &set_weight_ids(std::shared_ptr> ids); + lhe &add_weight_id(const std::string &id); + lhe &add_weight_id(std::string &&id); + + void sort_events_mutable(); + void sort_events_const(); + + void sort_events(); + void unsort_events(); + void events_to_processes(); + void processes_to_events(); + void transpose(); + void transpose(std::string dir); + + void extract_weight_ids(); + void sync_weight_ids(); + void append_weight_ids(bool include = false); + + void print_header(std::ostream &os = std::cout) const; + void print(std::ostream &os = std::cout, bool include_ids = false); + }; + + // Classes for XML handling + // xmlDoc: Owns the entire XML text buffer. Nodes keep this alive via shared_ptr. + class xmlDoc + { + public: + xmlDoc() = default; + explicit xmlDoc(std::string xml); + const std::string &str() const noexcept; + std::string_view view() const noexcept; + std::shared_ptr shared() const noexcept; + + private: + std::shared_ptr buf_{std::make_shared()}; + }; + + // Attribute with copy-on-write mutability. 
+ struct Attr + { + // Original views into the mother buffer: + std::string_view name_view{}; + std::string_view value_view{}; + + // Only allocated if modified: + std::optional name_new{}; + std::optional value_new{}; + + std::string_view name() const noexcept; + std::string_view value() const noexcept; + bool modified() const noexcept; + }; + + // xmlNode: Main XML node representation + // Handles parsing, writing, access etc + class xmlNode + { + public: + xmlNode(); + ~xmlNode(); + + // Parse from an owning string (keeps only one owned copy). + static std::shared_ptr parse(std::string xml); + + // Parse from an already-shared buffer. + static std::shared_ptr parse(const std::shared_ptr &buf); + + std::string_view name() const noexcept; + std::string_view full() const noexcept; // full slice [start_, end_) + std::string_view content() const noexcept; // [content_start_, content_end_) + const std::vector &attrs() const noexcept; + std::vector> &children(); + + bool modified(bool deep = false) const noexcept; + bool is_leaf() const noexcept; + + // Offsets into the mother buffer: + size_t start() const noexcept { return start_; } + size_t head_end() const noexcept { return head_end_; } + size_t content_start() const noexcept { return content_start_; } + size_t content_end() const noexcept { return content_end_; } + size_t end() const noexcept { return end_; } + + // ---- Mutations (copy-on-write) ---- + void set_name(std::string new_name); + void set_content(std::string new_content); // replace text content + void set_content_writer(std::function writer); // stream replacement + bool set_attr(std::string_view key, std::string new_value); // returns false if not found + void add_attr(std::string name, std::string value); // adds a brand-new attribute + + void add_child(std::shared_ptr child, bool add_nl = false); + + void write(std::ostream &os) const; // zero-copy where possible + void write(std::string &out) const; // appends into out + + size_t n_children() 
const noexcept { return children_.size(); } + + bool has_child(std::string_view name) const noexcept; + std::shared_ptr get_child(std::string_view name) const noexcept; // first matching child or nullptr + + std::vector> get_children(std::string_view name) const noexcept; // all matching children + + std::shared_ptr deep_copy() const; + + // Remove/suppress a child so it won’t be written (but offsets still used to skip its original bytes) + bool remove_child(size_t index) noexcept; + bool remove_child(const xmlNode *child) noexcept; + bool remove_child(std::string_view name) noexcept; + + // Insert new children relative to existing ones + bool insert_child_before(size_t anchor_in_doc_ordinal, std::shared_ptr child) noexcept; + bool insert_child_after(size_t anchor_in_doc_ordinal, std::shared_ptr child) noexcept; + + // Replace an existing child at same spot (suppresses old, inserts new before it) + bool replace_child(size_t anchor_in_doc_ordinal, std::shared_ptr child) noexcept; + bool replace_child(std::string_view anchor_name, std::shared_ptr child, bool add_nl = false) noexcept; + + // Insert at specific location in this node's content + // (rel_offset is in bytes relative to content_start(); clamped to [0, content_len]). + bool insert_child_at_content_offset(size_t rel_offset, std::shared_ptr child) noexcept; + + // Convenience: explicit start/end of content + bool insert_child_at_start(std::shared_ptr child) noexcept; + bool insert_child_at_end(std::shared_ptr child) noexcept; + + // Hints to add newlines before starting or ending nodes + bool append_nl_start = false; + bool append_nl_end = false; + + private: + // Internal constructor used by parser. + explicit xmlNode(std::shared_ptr doc); + + // Recursive element parser. Expects 'pos' at '<' for a normal element. + static std::shared_ptr parse_element(const std::shared_ptr &doc, size_t &pos); + + // Top-level scanner to locate the first element (skips XML decl, comments, etc.). 
+ static size_t find_first_element_start(const std::string &s, size_t pos); + + // Helpers for parsing attributes and skipping markup we don't turn into nodes. + static void parse_attributes(xmlNode &node, size_t &cur); + static bool skip_comment(const std::string &s, size_t &pos); // + static bool skip_pi(const std::string &s, size_t &pos); // + static bool skip_doctype(const std::string &s, size_t &pos); // + static bool skip_cdata(const std::string &s, size_t &pos); // + + // Writer helpers + void write_start_tag(std::ostream &os) const; + void write_end_tag(std::ostream &os) const; + bool modified_header() const noexcept; + bool modified_footer() const noexcept; + + std::shared_ptr doc_{}; + + // Offsets into *doc_: + size_t start_ = npos; + size_t head_end_ = npos; + size_t content_start_ = npos; + size_t content_end_ = npos; + size_t end_ = npos; + + size_t prolog_start_ = 0; // usually 0 + size_t prolog_end_ = 0; // byte offset of first '<' of the root element + + // Child nodes may be “suppressed” for serialization by a parent; + // this flag is *only* consulted by the parent’s writer loop. 
+ bool suppressed_ = false; + + bool self_closing_ = false; + + // Internal: write with context (as a root or embedded as a child) + void write_impl(std::ostream &os, bool as_child) const; + + const xmlNode *nth_in_doc_child(size_t ordinal) const noexcept; + + // Placement hints for synthetic (added) children; ownership stays in children_ + struct InsertHint + { + enum class Where + { + Before, + After, + AtAbs, + AtStart, + AtEnd + }; + Where where; + const xmlNode *anchor = nullptr; // for Before/After + size_t abs = 0; // absolute byte offset in this->doc_ for AtAbs + const xmlNode *node = nullptr; // child to write (owned by children_) + }; + std::vector inserts_; + + std::string_view name_view_{}; + std::optional name_new_{}; + std::optional content_new_{}; + std::function content_writer_{}; + std::vector attrs_{}; + std::vector> children_{}; + bool modified_ = false; + }; + + // Helpers for lheReader + namespace lhe_build_detail + { + + template + inline constexpr bool is_shared_event_v = + std::is_same_v>, std::shared_ptr>; + + template + inline constexpr bool is_unique_event_v = + std::is_same_v>, std::unique_ptr>; + + template + inline constexpr bool is_event_object_v = + std::is_same_v>, event> || + std::is_base_of_v>>; + + // Convert {event / unique_ptr / shared_ptr} -> shared_ptr + template + std::shared_ptr to_shared_event(U &&u) + { + if constexpr (is_shared_event_v) + { + return std::forward(u); + } + else if constexpr (is_unique_event_v) + { + // move out of unique_ptr into shared_ptr + auto *raw = std::forward(u).release(); + return std::shared_ptr(raw); + } + else if constexpr (is_event_object_v) + { + using Decayed = std::remove_cv_t>; + if constexpr (std::is_same_v) + { + return std::make_shared(std::forward(u)); + } + else + { + // Derived from event + return std::make_shared(std::forward(u)); + } + } + else + { + static_assert(is_shared_event_v || is_unique_event_v || is_event_object_v, + "Event translator must return event, unique_ptr, or 
shared_ptr."); + return {}; // unreachable + } + } + + } // namespace lhe_build_detail + + // lheReader: Builds LHE (Les Houches Event) files from raw inputs + // takes as input an initNode reader, an event reader, and an optional header reader + template + class lheReader + { + public: + using InitTx = std::function; + using EventTx = std::function(const EventRaw &)>; + using HeaderTx = std::function; + + lheReader() = default; + lheReader(const lheReader &) = default; + lheReader(lheReader &&) = default; + lheReader &operator=(const lheReader &) = default; + lheReader &operator=(lheReader &&) = default; + template + explicit lheReader(InitTx init_tx, EventTx event_tx) + { + set_init_translator(std::move(init_tx)); + set_event_translator(std::move(event_tx)); + } + + template + lheReader(InitTx init_tx, EventTx event_tx, HeaderTx header_tx) + { + set_init_translator(std::move(init_tx)); + set_event_translator(std::move(event_tx)); + set_header_translator(std::move(header_tx)); + } + + lheReader &set_init_translator(InitTx tx) + { + init_tx_ = std::move(tx); + return *this; + } + + template + lheReader &set_event_translator(F &&f) + { + // Accepts any callable returning event-like; adapt to shared_ptr. 
+ event_tx_ = [fn = std::forward(f)](const EventRaw &r) -> std::shared_ptr + { + auto out = fn(r); + return lhe_build_detail::to_shared_event(std::move(out)); + }; + return *this; + } + + lheReader &set_header_translator(HeaderTx tx) + { + header_tx_ = std::move(tx); + return *this; + } + + lheReader &set_filter(bool v) + { + filter_processes_ = v; + return *this; + } + + // Read from raw inputs + template + lhe read(const InitRaw &init_raw, + const EventRange &events_raw, + std::optional header_raw = std::nullopt) const + { + ensure_ready_(); + + // 1) Read initNode and validate + initNode init = init_tx_(init_raw); + init.validate_init(); + + // 2) Translate events -> vector> + std::vector> evts; + if constexpr (has_size_v) + { + evts.reserve(events_raw.size()); + } + for (const auto &er : events_raw) + { + evts.emplace_back(event_tx_(er)); + if (!evts.back()) + { + throw std::runtime_error("event translator produced null shared_ptr."); + } + } + + // 3) Construct lhe with init + events + lhe out(init, std::move(evts)); + + // 4) Optional header + if (header_raw.has_value()) + { + if (!header_tx_) + { + throw std::runtime_error("Header provided but no header translator configured."); + } + out.set_header(header_tx_.value()(header_raw.value())); + } + + out.set_filter(filter_processes_); + return out; + } + + private: + void ensure_ready_() const + { + if (!init_tx_) + throw std::runtime_error("init translator not set."); + if (!event_tx_) + throw std::runtime_error("event translator not set."); + } + + // Translators + InitTx init_tx_; + EventTx event_tx_; + std::optional header_tx_; + + bool filter_processes_ = false; + }; + + template + lhe read_lhe(const InitRaw &init_raw, + const EventRange &events_raw, + InitTx init_tx, + EventTx event_tx, + std::optional header_raw = std::nullopt, + HeaderTx header_tx = nullptr, + bool filter_processes = false) + { + using EventRawT = typename std::decay::type; + + lheReader b; + b.set_init_translator(std::move(init_tx)) + 
.set_event_translator(std::move(event_tx)) + .set_filter(filter_processes); + + if constexpr (!std::is_same_v) + { + if constexpr (std::is_pointer_v>) + { + if (header_tx) + b.set_header_translator(header_tx); + } + else + { + b.set_header_translator(std::move(header_tx)); + } + } + return b.read(init_raw, events_raw, header_raw); + } + + struct ignore_header_t + { + std::optional operator()(const std::any &) const noexcept + { + return std::nullopt; + } + }; + + // output container post writing to user-defined class + template + struct lheRaw + { + using init_type = RawInit; + using event_type = RawEvent; + using header_opt = RawHeaderOpt; + + RawInit init; + std::vector events; + RawHeaderOpt header; + }; + + // lheWriter: Takes user-supplied constructors and applies them to + // the lhe object and returns an lheRaw container from it + template < + class InitRaw, + class EventRaw, + class HeaderRaw = std::monostate> + class lheWriter + { + public: + using InitTx = std::function; + using EventTx = std::function; + using HeaderTx = std::function; + using result_t = lheRaw; + + lheWriter(InitTx init_tx, EventTx event_tx, HeaderTx header_tx = HeaderTx{}) + : init_fn_(std::move(init_tx)), event_fn_(std::move(event_tx)), header_fn_(std::move(header_tx)) {} + + lheWriter &set_init_translator(InitTx tx) + { + init_fn_ = std::move(tx); + return *this; + } + + lheWriter &set_event_translator(EventTx tx) + { + event_fn_ = std::move(tx); + return *this; + } + + lheWriter &set_header_translator(HeaderTx tx) + { + header_fn_ = std::move(tx); + return *this; + } + + result_t to_raw(const lhe &doc) const + { + result_t out; + + if (doc.header.has_value()) + { + if (header_fn_) + out.header = header_fn_(doc.header); + else + warning("lheWriter::to_raw(): header present but no header translator configured; ignoring."); + } + + out.init = init_fn_(static_cast(doc)); + + out.events.reserve(doc.events.size()); + for (const auto &pevt : doc.events) + if (pevt) + 
out.events.push_back(event_fn_(*pevt)); + + return out; + } + + // Convenience: build raw, then let any writer consume it (kept separate by design) + template + void write(const lhe &doc, OS &os, Writer &&writer) const + { + auto raw = to_raw(doc); + std::forward(writer)(os, raw); + } + + private: + InitTx init_fn_; + EventTx event_fn_; + HeaderTx header_fn_; + }; + + template ()))>::type, + class HeaderRaw = std::monostate, + class InitTx, + class EventTx, + class HeaderTx = std::nullptr_t> + lheRaw write_lhe(lhe &doc, + InitTx init_tx, + EventTx event_tx, + HeaderTx header_tx = nullptr) + { + // using EventRawT = typename std::decay::type; + + lheWriter w(std::move(init_tx), std::move(event_tx)); + if constexpr (!std::is_same_v) + { + w.set_header_translator(std::move(header_tx)); + } + return w.to_raw(doc); + } + + std::shared_ptr string_to_event(std::string_view content); + std::shared_ptr xml_to_event(std::shared_ptr node); + initNode string_to_init(std::string_view content); + initNode xml_to_init(std::shared_ptr node); + std::any xml_to_any(std::shared_ptr node); + template + std::any to_any(T &&value) + { + return std::make_any>(std::forward(value)); + } + + std::shared_ptr init_to_xml(const initNode &); + std::shared_ptr event_to_xml(event &); + std::optional> header_to_xml(const std::any &); + + using XmlToInitFn = initNode (*)(std::shared_ptr); + using XmlToEventFn = std::shared_ptr (*)(std::shared_ptr); + using XmlToHeaderFn = std::any (*)(std::shared_ptr); + using InitToXmlFn = std::shared_ptr (*)(const initNode &); + using EventToXmlFn = std::shared_ptr (*)(event &); + using HeaderToXmlFn = std::optional> (*)(const std::any &); + + using xmlReader = lheReader, + std::shared_ptr, + std::shared_ptr>; + using xmlWriter = lheWriter< + std::shared_ptr, // InitRaw + std::shared_ptr, // EventRaw + std::optional> // HeaderRaw (must be optional<...>) + >; + + using xmlRaw = lheRaw< + std::shared_ptr, // RawInit + std::shared_ptr, // RawEvent + std::optional> 
// RawHeaderOpt + >; + + // Accessor to prebuilt XML instance + const xmlReader &xml_reader(); + const xmlWriter &xml_writer(); + + // Convenience wrapper: translate to the Raw structure + xmlRaw to_xml_raw(const lhe &doc); + extern template class lheWriter< + std::shared_ptr, + std::shared_ptr, + std::optional>>; + template class lheWriter< + std::shared_ptr, + std::shared_ptr, + std::optional>>; + + lhe to_lhe(std::shared_ptr node); + lhe to_lhe(const std::string &xml); + lhe load_lhef(std::istream &in); + lhe load_lhef(const std::string &filename); + void write_lhef(lhe &doc, std::ostream &out = std::cout, bool include_ids = false); + void write_lhef(lhe &doc, const std::string &filename, bool include_ids = false); + std::shared_ptr to_xml(xmlRaw &raw); + std::shared_ptr to_xml(const lhe &doc); + std::shared_ptr load_xml(const std::string &filename); + + /// Minimal SLHA container: BLOCK entries with integer indices -> double values, + /// and DECAY widths keyed by PDG id. Comments are discarded. 
+ class slha + { + public: + void read(std::istream &in); + static slha parse(std::istream &in) + { + slha s; + s.read(in); + return s; + } + + void write(std::ostream &out = std::cout, + int value_precision = 6, + bool scientific = true, + const std::string &indent = " ") const; + + // Get with fallback + double get(const std::string &block, + std::initializer_list indices, + double fallback = 0.0) const; + double get(const std::string &block, + int index, + double fallback = 0.0) const; + + // Set + void set(const std::string &block, + std::initializer_list indices, + double value); + void set(const std::string &block, + int index, + double value); + + // -------- DECAY: Get / Set -------- + // With fallback + double get_decay(int pid, double fallback = 0.0) const; + void set_decay(int pid, double width); + + // Introspection + bool has_block(const std::string &block) const; + bool has_entry(const std::string &block, std::initializer_list indices) const; + + private: + struct VecLess + { + bool operator()(const std::vector &a, const std::vector &b) const + { + return a < b; // lexicographic + } + }; + struct BlockData + { + std::map, double, VecLess> entries; + }; + + std::map blocks_; // UPPER block -> data + std::map decays_; // pid -> width + + static std::string trim(const std::string &s); + static bool starts_with_ci(const std::string &s, const char *prefix); + static std::string upper(std::string s); + + static std::string indices_to_string(std::initializer_list indices); + }; + + slha to_slha(const std::string &slha_text); + slha to_slha(std::istream &slha_stream); + slha to_slha(std::shared_ptr node); + slha to_slha(const lhe &doc); + slha load_slha(const std::string &filename); + +} // namespace REX + +#endif // DEFINING _REX_H_ FUNCTIONALITY \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_driver.mk b/PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_driver.mk new file mode 100644 index 0000000000..ba56c9509a --- 
/dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_driver.mk @@ -0,0 +1,878 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. + +#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') + +#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk + +#------------------------------------------------------------------------------- + +#=== Include cudacpp_config.mk + +# Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported (and configure defaults if no user-defined choices exist) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing. 
+# Determine CUDACPP_BUILDDIR from a DIRTAG based on BACKEND, FPTYPE, HELINL, HRDCOD and from the user-defined choice of USEBUILDDIR +include ../src/cudacpp_config.mk + +# Export CUDACPP_BUILDDIR (so that there is no need to check/define it again in cudacpp_src.mk) +export CUDACPP_BUILDDIR + +#------------------------------------------------------------------------------- + +#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) + +SHELL := /bin/bash + +#------------------------------------------------------------------------------- + +#=== Detect O/S and architecture (assuming uname is available, https://en.wikipedia.org/wiki/Uname) + +# Detect O/S kernel (Linux, Darwin...) +UNAME_S := $(shell uname -s) +###$(info UNAME_S='$(UNAME_S)') + +# Detect architecture (x86_64, ppc64le...) +UNAME_P := $(shell uname -p) +###$(info UNAME_P='$(UNAME_P)') + +#------------------------------------------------------------------------------- + +#=== Include the common MG5aMC Makefile options + +# OM: including make_opts is crucial for MG5aMC flag consistency/documentation +# AV: disable the inclusion of make_opts if the file has not been generated (standalone cudacpp) +ifneq ($(wildcard ../Source/make_opts),) + include ../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Redefine BACKEND if the current value is 'cppauto' + +# Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +ifeq ($(BACKEND),cppauto) + ifeq ($(UNAME_P),ppc64le) + override BACKEND = cppsse4 + else ifeq ($(UNAME_P),arm) + override BACKEND = cppsse4 + else ifeq ($(wildcard /proc/cpuinfo),) + override BACKEND = cppnone + ###$(warning Using BACKEND='$(BACKEND)' because host SIMD features cannot be read from /proc/cpuinfo) + else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1) + override BACKEND = cpp512y + else 
ifeq ($(shell grep -m1 -c avx2 /proc/cpuinfo),1) + override BACKEND = cppavx2 + ###ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1) + ### $(warning Using BACKEND='$(BACKEND)' because host does not support avx512vl) + ###else + ### $(warning Using BACKEND='$(BACKEND)' because this is faster than avx512vl for clang) + ###endif + else ifeq ($(shell grep -m1 -c sse4_2 /proc/cpuinfo),1) + override BACKEND = cppsse4 + else + override BACKEND = cppnone + endif + $(info BACKEND=$(BACKEND) (was cppauto)) +else + $(info BACKEND='$(BACKEND)') +endif + +#------------------------------------------------------------------------------- + +#=== Configure the C++ compiler + +CXXFLAGS = $(OPTFLAGS) -std=c++17 -Wall -Wshadow -Wextra +ifeq ($(shell $(CXX) --version | grep ^nvc++),) + CXXFLAGS += -ffast-math # see issue #117 +endif +###CXXFLAGS+= -Ofast # performance is not different from --fast-math +###CXXFLAGS+= -g # FOR DEBUGGING ONLY + +# Optionally add debug flags to display the full list of flags (eg on Darwin) +###CXXFLAGS+= -v + +# Note: AR, CXX and FC are implicitly defined if not set externally +# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html + +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -mmacosx-version-min=11.3 +endif + +# Export CXXFLAGS (so that there is no need to check/define it again in cudacpp_src.mk) +export CXXFLAGS + +#------------------------------------------------------------------------------- + +#=== Configure the GPU compiler (CUDA or HIP) +#=== (note, this is done also for C++, as NVTX and CURAND/ROCRAND are also needed by the C++ backends) + +# Set CUDA_HOME from the path to nvcc, if it exists +#override CUDA_HOME = $(patsubst %%/bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) +CUDA_HOME := $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + +# Set HIP_HOME from the 
path to hipcc, if it exists +override HIP_HOME = $(patsubst %%/bin/hipcc,%%,$(shell which hipcc 2>/dev/null)) + +# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists +# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?) +ifneq ($(CUDA_HOME),) + USE_NVTX ?=-DUSE_NVTX + CUDA_INC = -I$(CUDA_HOME)/include/ +else + override USE_NVTX= + override CUDA_INC= +endif + +# NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024) +# - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP. +# If both CUDA and HIP were installed, then CUDA took precedence over HIP, and the only way to force HIP builds was to disable +# CUDA builds by setting CUDA_HOME to an invalid value (as CUDA_HOME took precedence over PATH to find the installation of nvcc). +# Similarly, C++-only builds could be forced by setting CUDA_HOME and/or HIP_HOME to invalid values. A check for an invalid nvcc +# in CUDA_HOME or an invalid hipcc in HIP_HOME was necessary to ensure this logic, and had to be performed at the very beginning. +# - In the new implementation (PR #798), separate individual builds are performed for one specific C++/AVX mode, for CUDA or +# for HIP. The choice of the type of build is taken depending on the value of the BACKEND variable (replacing the AVX variable). +# Unlike what happened in the past, nvcc and hipcc must have already been added to PATH. Using 'which nvcc' and 'which hipcc', +# their existence and their location is checked, and the variables CUDA_HOME and HIP_HOME are internally set by this makefile. +# This must be still done before backend-specific customizations, e.g. because CURAND and NVTX are also used in C++ builds. +# Note also that a preliminary check for nvcc and hipcc if BACKEND is cuda or hip is performed in cudacpp_config.mk. 
+# - Note also that the REQUIRE_CUDA variable (which was used in the past, e.g. for CI tests on GPU #443) is now (PR #798) no +# longer necessary, as it is now equivalent to BACKEND=cuda. Similarly, there is no need to introduce a REQUIRE_HIP variable. + +#=== Configure the CUDA or HIP compiler (only for the CUDA and HIP backends) +#=== (NB: throughout all makefiles, an empty GPUCC is used to indicate that this is a C++ build, i.e. that BACKEND is neither cuda nor hip!) + +ifeq ($(BACKEND),cuda) + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(error BACKEND=$(BACKEND) but CUDA builds are not supported for multi-word CXX "$(CXX)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc (it was already checked above that this exists) + GPUCC = $(CUDA_HOME)/bin/nvcc + XCOMPILERFLAG = -Xcompiler + GPULANGUAGE = cu + GPUSUFFIX = cuda + + # Basic compiler flags (optimization and includes) + GPUFLAGS = $(foreach opt, $(OPTFLAGS), $(XCOMPILERFLAG) $(opt)) + + # NVidia CUDA architecture flags + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # This will embed device code for 70, and PTX for 70+. + # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + GPUFLAGS += $(GPUARCHFLAGS) + + # Other NVidia-specific flags + CUDA_OPTFLAGS = -lineinfo + GPUFLAGS += $(CUDA_OPTFLAGS) + + # NVCC version + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + + # Fast math + GPUFLAGS += -use_fast_math + + # Extra build warnings + ###GPUFLAGS += $(XCOMPILERFLAG) -Wall $(XCOMPILERFLAG) -Wextra $(XCOMPILERFLAG) -Wshadow + + # CUDA includes and NVTX + GPUFLAGS += $(CUDA_INC) $(USE_NVTX) + + # C++ standard + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + + # For nvcc, use -maxrregcount to control the maximum number of registers (this does not exist in hipcc) + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + # Set the host C++ compiler for 
nvcc via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(BACKEND),hip) + + # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) + GPUCC = $(HIP_HOME)/bin/hipcc + XCOMPILERFLAG = + GPULANGUAGE = hip + GPUSUFFIX = hip + + # Basic compiler flags (optimization and includes) + GPUFLAGS = $(foreach opt, $(OPTFLAGS), $(XCOMPILERFLAG) $(opt)) + + # AMD HIP architecture flags + GPUARCHFLAGS = --offload-arch=gfx90a + GPUFLAGS += $(GPUARCHFLAGS) + + # Other AMD-specific flags + GPUFLAGS += -target x86_64-linux-gnu -DHIP_PLATFORM=amd + + # Fast math (is -DHIP_FAST_MATH equivalent to -ffast-math?) + GPUFLAGS += -DHIP_FAST_MATH + + # Extra build warnings + ###GPUFLAGS += $(XCOMPILERFLAG) -Wall $(XCOMPILERFLAG) -Wextra $(XCOMPILERFLAG) -Wshadow + + # HIP includes + HIP_INC = -I$(HIP_HOME)/include/ + GPUFLAGS += $(HIP_INC) + + # C++ standard + GPUFLAGS += -std=c++17 + +else + + # Backend is neither cuda nor hip + override GPUCC= + override GPUFLAGS= + + # Sanity check, this should never happen: if GPUCC is empty, then this is a C++ build, i.e. BACKEND is neither cuda nor hip. + # In practice, in the following, "ifeq ($(GPUCC),)" is equivalent to "ifneq ($(findstring cpp,$(BACKEND)),)". + # Conversely, note that GPUFLAGS is non-empty also for C++ builds, but it is never used in that case. + ifeq ($(findstring cpp,$(BACKEND)),) + $(error INTERNAL ERROR! 
Unknown backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) + endif + +endif + +# Export GPUCC, GPUFLAGS, GPULANGUAGE, GPUSUFFIX (these are needed by both src and rwgt_runners, but should not be overwritten there) +export CUDA_HOME +export GPUCC +export GPUFLAGS +export GPULANGUAGE +export GPUSUFFIX +export XCOMPILERFLAG + +#------------------------------------------------------------------------------- + +#=== Configure ccache for C++ and CUDA/HIP builds + +# Enable ccache if USECCACHE=1 +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX:=ccache $(CXX) +endif +#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) +# override AR:=ccache $(AR) +#endif +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) + endif +endif + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lrex -ltearex +INCFLAGS += -I../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +#------------------------------------------------------------------------------- + +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP + +# PowerPC-specific CXX compiler flags (being reviewed) +ifeq ($(UNAME_P),ppc64le) + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for cppnone and cppsse4 + # Throughput references without the extra flags below: cppnone=1.41-1.42E6, cppsse4=2.15-2.19E6 + ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change + ###CXXFLAGS+= -fpeel-loops # no change + ###CXXFLAGS+= -funroll-loops # gains ~1%% for cppnone, loses ~1%% for cppsse4 + ###CXXFLAGS+= -ftree-vectorize # no change + ###CXXFLAGS+= -flto # would increase to cppnone=4.08-4.12E6, cppsse4=4.99-5.03E6! +else + ###CXXFLAGS+= -flto # also on Intel this would increase throughputs by a factor 2 to 4... 
+ ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) +endif + +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) +ifeq ($(UNAME_P),ppc64le) + GPUFLAGS+= $(XCOMPILERFLAG) -mno-float128 +endif + +#------------------------------------------------------------------------------- + +#=== Configure defaults for OMPFLAGS + +# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ###$(info USEOPENMP==1: will build with OpenMP if possible) + ifneq ($(findstring hipcc,$(GPUCC)),) + override OMPFLAGS = # disable OpenMP MT when using hipcc #802 + else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) + else ifneq ($(shell $(CXX) --version | egrep '^clang version 16'),) + ###override OMPFLAGS = # disable OpenMP on clang16 #904 + $(error OpenMP is not supported by cudacpp on clang16 - issue #904) + else ifneq ($(shell $(CXX) --version | egrep '^clang version 17'),) + ###override OMPFLAGS = # disable OpenMP on clang17 #904 + $(error OpenMP is not supported by cudacpp on clang17 - issue #904) + else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) + override OMPFLAGS = -fopenmp + ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) + ###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) + else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) + override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) + ###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) + else + override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms + ###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) + endif +else + ###$(info USEOPENMP!=1: will build without OpenMP) + override OMPFLAGS = +endif + +#------------------------------------------------------------------------------- + +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this take precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else ifneq ($(RNDGEN),hasNoCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) + endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + ifneq ($(CUDA_HOME),) + # By default, assume that curand is installed if a CUDA installation exists + override HASCURAND = hasCurand + else + override HASCURAND = hasNoCurand + endif + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) 
+ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD + +# Set the build flags appropriate to OMPFLAGS +$(info OMPFLAGS=$(OMPFLAGS)) +CXXFLAGS += $(OMPFLAGS) + +# Set the build flags appropriate to each BACKEND choice (example: "make BACKEND=cppnone") +# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] +# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] +ifeq ($(UNAME_P),ppc64le) + ifeq ($(BACKEND),cppsse4) + override AVXFLAGS = -D__SSE4_2__ # Power9 VSX with 128 width (VSR registers) + else ifeq ($(BACKEND),cppavx2) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) + else ifeq ($(BACKEND),cpp512y) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) + else ifeq ($(BACKEND),cpp512z) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) + endif +else ifeq ($(UNAME_P),arm) + ifeq ($(BACKEND),cppsse4) + override AVXFLAGS = -D__SSE4_2__ # ARM NEON with 128 width (Q/quadword registers) + else ifeq ($(BACKEND),cppavx2) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) + else ifeq ($(BACKEND),cpp512y) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) + else ifeq ($(BACKEND),cpp512z) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) + endif +else 
ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 + ifeq ($(BACKEND),cppnone) + override AVXFLAGS = -mno-sse3 # no SIMD + else ifeq ($(BACKEND),cppsse4) + override AVXFLAGS = -mno-avx # SSE4.2 with 128 width (xmm registers) + else ifeq ($(BACKEND),cppavx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(BACKEND),cpp512y) + override AVXFLAGS = -march=skylake -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(BACKEND),cpp512z) + override AVXFLAGS = -march=skylake -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + endif +else + ifeq ($(BACKEND),cppnone) + override AVXFLAGS = -march=x86-64 # no SIMD (see #588) + else ifeq ($(BACKEND),cppsse4) + override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers) + else ifeq ($(BACKEND),cppavx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(BACKEND),cpp512y) + override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(BACKEND),cpp512z) + override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + endif +endif +# For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
+ifeq ($(GPUCC),) + CXXFLAGS+= $(AVXFLAGS) +endif + +# Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") +$(info FPTYPE='$(FPTYPE)') +ifeq ($(FPTYPE),d) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE +else ifeq ($(FPTYPE),f) + CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT +else ifeq ($(FPTYPE),m) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT +else + $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) +endif + +# Set the build flags appropriate to each HELINL choice (example: "make HELINL=1") +$(info HELINL='$(HELINL)') +ifeq ($(HELINL),1) + CXXFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS +else ifneq ($(HELINL),0) + $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1") +$(info HRDCOD='$(HRDCOD)') +ifeq ($(HRDCOD),1) + CXXFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM +else ifneq ($(HRDCOD),0) + $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) +endif + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq ($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+else + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) +endif + +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) + +#------------------------------------------------------------------------------- + +#=== Configure Position-Independent Code +CXXFLAGS += -fPIC +GPUFLAGS += $(XCOMPILERFLAG) -fPIC + +#------------------------------------------------------------------------------- + +#=== Configure build directories and build lockfiles === + +# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) +# (Rationale: avoid mixing of builds with different random number generators) +override TAG = $(patsubst cpp%%,%%,$(BACKEND))_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) + +# Export TAG (so that there is no need to check/define it again in cudacpp_src.mk) +export TAG + +# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 +override BUILDDIR = $(CUDACPP_BUILDDIR) +ifeq ($(USEBUILDDIR),1) + override LIBDIR = ../lib/$(BUILDDIR) + override LIBDIRRPATH = '$$ORIGIN/$(LIBDIR)' + $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR == 1)) +else + override LIBDIR = ../lib + override LIBDIRRPATH = '$$ORIGIN/$(LIBDIR)' + $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR != 1)) +endif +###override INCDIR = ../../include +###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) + +# On Linux, set rpath to LIBDIR to make it unnecessary to use 
LD_LIBRARY_PATH +# Use relative paths with respect to the executables or shared libraries ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override CXXLIBFLAGSRPATH = + override GPULIBFLAGSRPATH = + override CXXLIBFLAGSRPATH2 = + override GPULIBFLAGSRPATH2 = +else + # RPATH to gpu/cpp libs when linking executables + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override GPULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + # RPATH to common lib when linking gpu/cpp libs + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override GPULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' +endif + +# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) +override RUNTIME = + +#=============================================================================== +#=== Makefile TARGETS and build rules below +#=============================================================================== + +.PHONY: all $(DIRS) + +DIRS := $(wildcard P*) + +# Construct the library paths +cxx_proclibs := $(shell for dir in $(DIRS); do basename $$dir | awk -F_ '{print "-l mg5amc_"$$(NF-1)"_"$$NF"_cpp"}'; done) +gpu_proclibs := $(shell for dir in $(DIRS); do basename $$dir | awk -F_ '{print "-l mg5amc_"$$(NF-1)"_"$$NF"_$(GPUSUFFIX)"}'; done) + +ifeq ($(GPUCC),) + cxx_rwgt=$(BUILDDIR)/rwgt_driver_cpp.exe + rwgtlib := $(addprefix ,$(addsuffix /librwgt_cpp.so,$(DIRS))) +else + gpu_rwgt=$(BUILDDIR)/rwgt_driver_gpu.exe + rwgtlib := $(addprefix ,$(addsuffix /librwgt_$(GPUSUFFIX).so,$(DIRS))) +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all.$(TAG) + +# First target (default goal) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(rwgtlib) $(cxx_rwgt) +else +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(rwgtlib) $(gpu_rwgt) +endif + +# Target (and build 
options): debug +MAKEDEBUG= +debug: OPTFLAGS = -g -O0 +debug: CUDA_OPTFLAGS = -G +debug: MAKEDEBUG := debug +debug: all.$(TAG) + +# Target: tag-specific build lockfiles +override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` +$(BUILDDIR)/.build.$(TAG): + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi + @touch $(BUILDDIR)/.build.$(TAG) + +# # Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/rwgt_driver_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +$(BUILDDIR)/rwgt_driver_gpu.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) + +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins +endif +endif + +# Generic target and build rules: objects from C++ compilation +# (NB do not include CUDA_INC here! add it only for NVTX or curand #679) +$(BUILDDIR)/%%_cpp.o : %%.cc *.h ../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ + +# Generic target and build rules: objects from CUDA or HIP compilation +ifneq ($(GPUCC),) +$(BUILDDIR)/%%_$(GPUSUFFIX).o : %%.cc *.h ../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ +endif + +#------------------------------------------------------------------------------- + +# Target (and build rules): common (src) library +commonlib : $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../src/*.h ../src/*.cc $(BUILDDIR)/.build.$(TAG) + $(MAKE) -C ../src $(MAKEDEBUG) -f $(CUDACPP_SRC_MAKEFILE) + +#------------------------------------------------------------------------------- + +#HERE LOOP MAKE OVER P DIRECTORIES AND ADD RWGT_RUNNER_LIBS +# Ensure each librwgt.a depends on its directory being built +$(rwgtlib): $(commonlib) + @$(MAKE) -C $(@D) VARIABLE=true + +# Target (and build rules): C++ and CUDA/HIP standalone executables +$(cxx_rwgt): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_rwgt): $(BUILDDIR)/rwgt_driver.o $(rwgtlib) + $(CXX) -o $@ $(BUILDDIR)/rwgt_driver.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) $(rwgtlib) + +ifneq ($(GPUCC),) +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(gpu_rwgt): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_rwgt): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 +$(gpu_rwgt): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +endif +$(gpu_rwgt): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_rwgt): $(BUILDDIR)/rwgt_driver.o $(rwgtlib) + $(GPUCC) -o $@ $(BUILDDIR)/rwgt_driver.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) $(rwgtlib) +endif + +#------------------------------------------------------------------------------- + +# Target: build all targets in all BACKEND modes (each BACKEND mode in a separate build directory) +# Split 
the bldall target into separate targets to allow parallel 'make -j bldall' builds +# (Obsolete hack, no longer needed as there is no INCDIR: add a fbridge.inc dependency to bldall, to ensure it is only copied once for all BACKEND modes) +bldcuda: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda -f $(CUDACPP_MAKEFILE) + +bldhip: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip -f $(CUDACPP_MAKEFILE) + +bldnone: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone -f $(CUDACPP_MAKEFILE) + +bldsse4: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 -f $(CUDACPP_MAKEFILE) + +bldavx2: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 -f $(CUDACPP_MAKEFILE) + +bld512y: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y -f $(CUDACPP_MAKEFILE) + +bld512z: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) + +ifeq ($(UNAME_P),ppc64le) +###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 +bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) +###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 +bldavxs: bldnone bldsse4 +else +###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavx2 bld512y bld512z +bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(HIP_HOME),) +ifneq ($(CUDA_HOME),) +bldall: bldhip bldcuda bldavxs +else +bldall: bldhip bldavxs +endif +else +ifneq ($(CUDA_HOME),) +bldall: bldcuda bldavxs +else +bldall: bldavxs +endif +endif + +#------------------------------------------------------------------------------- + +# Target: clean the builds +.PHONY: clean clean-rwgtlib + +clean: clean-rwgtlib +ifeq ($(USEBUILDDIR),1) + rm -rf $(BUILDDIR) +else + rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.exe + rm -f $(LIBDIR)/lib*.so +endif + $(MAKE) -C ../src clean -f $(CUDACPP_SRC_MAKEFILE) +### rm -rf $(INCDIR) + +clean-rwgtlib: + @for dir in $(DIRS); do $(MAKE) -C $$dir clean; done + +cleanall: + @echo + $(MAKE) USEBUILDDIR=0 clean -f $(CUDACPP_MAKEFILE) + @echo + $(MAKE) USEBUILDDIR=0 -C ../src cleanall -f $(CUDACPP_SRC_MAKEFILE) + rm -rf build.* + 
+# Target: clean the builds as well as the gtest installation(s) +distclean: cleanall +ifneq ($(wildcard $(TESTDIRCOMMON)),) + $(MAKE) -C $(TESTDIRCOMMON) clean +endif + $(MAKE) -C $(TESTDIRLOCAL) clean + +#------------------------------------------------------------------------------- + +# Target: show system and compiler information +info: + @echo "" + @uname -spn # e.g. Linux nodename.cern.ch x86_64 +ifeq ($(UNAME_S),Darwin) + @sysctl -a | grep -i brand + @sysctl -a | grep machdep.cpu | grep features || true + @sysctl -a | grep hw.physicalcpu: + @sysctl -a | grep hw.logicalcpu: +else + @cat /proc/cpuinfo | grep "model name" | sort -u + @cat /proc/cpuinfo | grep "flags" | sort -u + @cat /proc/cpuinfo | grep "cpu cores" | sort -u + @cat /proc/cpuinfo | grep "physical id" | sort -u +endif + @echo "" +ifneq ($(shell which nvidia-smi 2>/dev/null),) + nvidia-smi -L + @echo "" +endif + @echo USECCACHE=$(USECCACHE) +ifeq ($(USECCACHE),1) + ccache --version | head -1 +endif + @echo "" + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version +endif + @echo "" + @echo CXX=$(CXX) +ifneq ($(shell $(CXX) --version | grep ^clang),) + @echo $(CXX) -v + @$(CXX) -v |& egrep -v '(Found|multilib)' + @readelf -p .comment `$(CXX) -print-libgcc-file-name` |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print "GCC toolchain:",$$5}' +else + $(CXX) --version +endif + @echo "" + @echo FC=$(FC) + $(FC) --version + +#------------------------------------------------------------------------------- + +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] 
+# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck + +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) +runTest: all.$(TAG) +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif + +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) +runCheck: all.$(TAG) +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif + +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) +runFcheck: all.$(TAG) +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif + +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) +cmpFcheck: all.$(TAG) + @echo +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif + +# Target: cuda-memcheck (run the CUDA standalone executable check_$(GPUSUFFIX).exe with a small number of events through cuda-memcheck) +cuda-memcheck: all.$(TAG) + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 + +#------------------------------------------------------------------------------- diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_rex_src.mk b/PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_rex_src.mk new file mode 100644 index 0000000000..3cd2c2ec32 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_rex_src.mk @@ -0,0
+1,205 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. + +#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) +#=== NB: assume that the same name (e.g. cudacpp.mk, Makefile...) is used in the Subprocess and src directories + +THISMK = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) + +#------------------------------------------------------------------------------- + +#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) + +SHELL := /bin/bash + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for CUDA and C++ + +INCFLAGS = -I. + +#------------------------------------------------------------------------------- + +#=== Configure the C++ compiler (note: CXXFLAGS has been exported from cudacpp.mk) + +###$(info CXXFLAGS=$(CXXFLAGS)) + +# Note: AR, CXX and FC are implicitly defined if not set externally +# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html +###RANLIB = ranlib + +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +LDFLAGS = -L. -lrex -ltearex -Wl,-rpath=. 
+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + LDFLAGS += -mmacosx-version-min=11.3 +endif + +#------------------------------------------------------------------------------- + +#=== Configure the GPU (CUDA or HIP) compiler (note: GPUCC including ccache, GPUFLAGS, GPULANGUAGE, GPUSUFFIX have been exported from cudacpp.mk) + +###$(info GPUCC=$(GPUCC)) +###$(info GPUFLAGS=$(GPUFLAGS)) +###$(info GPULANGUAGE=$(GPULANGUAGE)) +###$(info GPUSUFFIX=$(GPUSUFFIX)) + +#------------------------------------------------------------------------------- + +#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache) + +# Enable ccache if USECCACHE=1 +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX:=ccache $(CXX) +endif +#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) +# override AR:=ccache $(AR) +#endif + +#------------------------------------------------------------------------------- + +#=== Configure build directories and build lockfiles === + +# Use the build directory exported from cudacpp.mk +###$(info CUDACPP_BUILDDIR=$(CUDACPP_BUILDDIR)) + +# Use the build lockfile "full" tag exported from cudacpp.mk +###$(info TAG=$(TAG)) + +# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 +###$(info Current directory is $(shell pwd)) +override BUILDDIR = $(CUDACPP_BUILDDIR) +ifeq ($(USEBUILDDIR),1) + override LIBDIRREL = ../lib/$(BUILDDIR) + ###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR=1 is set)) +else + override LIBDIRREL = ../lib + ###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is not set)) +endif +######$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) + +# Workaround for Mac #375 (I did not manage to fix rpath with @executable_path): use absolute paths for LIBDIR +# (NB: this is quite ugly because it creates the directory if it does not exist - to avoid removing src by mistake) +UNAME_S := $(shell uname -s) 
+ifeq ($(UNAME_S),Darwin) + override LIBDIR = $(shell mkdir -p $(LIBDIRREL); cd $(LIBDIRREL); pwd) + ifeq ($(wildcard $(LIBDIR)),) + $(error Directory LIBDIR="$(LIBDIR)" should have been created by now) + endif +else + override LIBDIR = $(LIBDIRREL) +endif + +#=============================================================================== +#=== Makefile TARGETS and build rules below +#=============================================================================== + +# NB1: there are no CUDA targets in src as we avoid RDC! +# NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! + +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all.$(TAG) + +# First target (default goal) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(LIBDIR)/librex.so $(LIBDIR)/libtearex.so + +# Target (and build options): debug +debug: all.$(TAG) + +# Target: tag-specific build lockfiles +override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` +override oldtagsl=`if [ -d $(LIBDIR) ]; then find $(LIBDIR) -maxdepth 1 -name '.build.*' ! 
-name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` + +$(BUILDDIR)/.build.$(TAG): $(LIBDIR)/.build.$(TAG) + +$(LIBDIR)/.build.$(TAG): + @if [ "$(oldtagsl)" != "" ]; then echo -e "Cannot build for tag=$(TAG) as old builds exist in $(LIBDIR) for other tags:\n$(oldtagsl)\nPlease run 'make clean' first\nIf 'make clean' is not enough: run 'make cleanall'"; exit 1; fi + @if [ "$(oldtagsb)" != "" ]; then echo -e "Cannot build for tag=$(TAG) as old builds exist in $(BUILDDIR) for other tags:\n$(oldtagsb)\nPlease run 'make clean' first\nIf 'make clean' is not enough: run 'make cleanall'"; exit 1; fi + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + @touch $(LIBDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + @touch $(BUILDDIR)/.build.$(TAG) + +#------------------------------------------------------------------------------- + +# Generic target and build rules: objects from C++ compilation +$(BUILDDIR)/%%_cpp.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ + +# Generic target and build rules: objects from CUDA compilation +ifneq ($(GPUCC),) +$(BUILDDIR)/%%_$(GPUSUFFIX).o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ +endif + +#------------------------------------------------------------------------------- + +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +cxx_objects+=$(addprefix $(BUILDDIR)/, rwgt_instance_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_cpp.o) +else + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_$(GPUSUFFIX).o) +endif + +# Target (and build rules): common (src) library +ifeq ($(GPUCC),) +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) +endif + +#------------------------------------------------------------------------------- + +# Atomic copy helper macro: +# 1) copy to unique temp file next to the destination +# 2) atomically rename into place +# This is safe under parallel make and concurrent invocations. 
+define ATOMIC_COPY + @tmp="$$(mktemp "$@.tmp.XXXXXX")"; \ + cp -f "$(firstword $^)" "$$tmp"; \ + mv -f "$$tmp" "$@" +endef + +# Rex and teaRex: copy .so from src to LIBDIR atomically +$(LIBDIR)/librex.so : ../src/librex.so + $(ATOMIC_COPY) + +$(LIBDIR)/libtearex.so : ../src/libtearex.so + $(ATOMIC_COPY) + +#------------------------------------------------------------------------------- + +# Target: clean the builds +.PHONY: clean + +clean: +ifeq ($(USEBUILDDIR),1) + rm -rf $(LIBDIR) + rm -rf $(BUILDDIR) +else + rm -f $(LIBDIR)/.build.* $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.exe +endif + +cleanall: + @echo + $(MAKE) clean -f $(THISMK) + @echo + rm -rf $(LIBDIR)/build.* + rm -rf build.* + +#------------------------------------------------------------------------------- diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_runner.mk b/PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_runner.mk new file mode 100644 index 0000000000..6410e806d5 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/makefiles/cudacpp_runner.mk @@ -0,0 +1,891 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. + +#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') + +#=== NB: different names (e.g. 
cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk + +#------------------------------------------------------------------------------- + +#=== Include cudacpp_config.mk + +# Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported (and configure defaults if no user-defined choices exist) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing. +# Determine CUDACPP_BUILDDIR from a DIRTAG based on BACKEND, FPTYPE, HELINL, HRDCOD and from the user-defined choice of USEBUILDDIR +include ../../src/cudacpp_config.mk + +# Export CUDACPP_BUILDDIR (so that there is no need to check/define it again in cudacpp_src.mk) +#export CUDACPP_BUILDDIR + +#------------------------------------------------------------------------------- + +#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) + +SHELL := /bin/bash + +#------------------------------------------------------------------------------- + +#=== Detect O/S and architecture (assuming uname is available, https://en.wikipedia.org/wiki/Uname) + +# Detect O/S kernel (Linux, Darwin...) +UNAME_S := $(shell uname -s) +###$(info UNAME_S='$(UNAME_S)') + +# Detect architecture (x86_64, ppc64le...) 
+UNAME_P := $(shell uname -p) +###$(info UNAME_P='$(UNAME_P)') + +#------------------------------------------------------------------------------- + +#=== Include the common MG5aMC Makefile options + +# OM: including make_opts is crucial for MG5aMC flag consistency/documentation +# AV: disable the inclusion of make_opts if the file has not been generated (standalone cudacpp) +ifneq ($(wildcard ../../Source/make_opts),) + include ../../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Redefine BACKEND if the current value is 'cppauto' + +# Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +ifeq ($(BACKEND),cppauto) + ifeq ($(UNAME_P),ppc64le) + override BACKEND = cppsse4 + else ifeq ($(UNAME_P),arm) + override BACKEND = cppsse4 + else ifeq ($(wildcard /proc/cpuinfo),) + override BACKEND = cppnone + ###$(warning Using BACKEND='$(BACKEND)' because host SIMD features cannot be read from /proc/cpuinfo) + else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1) + override BACKEND = cpp512y + else ifeq ($(shell grep -m1 -c avx2 /proc/cpuinfo),1) + override BACKEND = cppavx2 + ###ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1) + ### $(warning Using BACKEND='$(BACKEND)' because host does not support avx512vl) + ###else + ### $(warning Using BACKEND='$(BACKEND)' because this is faster than avx512vl for clang) + ###endif + else ifeq ($(shell grep -m1 -c sse4_2 /proc/cpuinfo),1) + override BACKEND = cppsse4 + else + override BACKEND = cppnone + endif + $(info BACKEND=$(BACKEND) (was cppauto)) +else + $(info BACKEND='$(BACKEND)') +endif + +#------------------------------------------------------------------------------- + +#=== Configure the GPU compiler (CUDA or HIP) +#=== (note, this is done also for C++, as NVTX and CURAND/ROCRAND are also needed by the C++ backends) + +# Set CUDA_HOME from the path to nvcc, if it 
exists +#override CUDA_HOME = $(patsubst %%/bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) + +# Set HIP_HOME from the path to hipcc, if it exists +#override HIP_HOME = $(patsubst %%/bin/hipcc,%%,$(shell which hipcc 2>/dev/null)) + +# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists +# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?) +ifneq ($(CUDA_HOME),) + USE_NVTX ?=-DUSE_NVTX + CUDA_INC = -I$(CUDA_HOME)/include/ +else + override USE_NVTX= + override CUDA_INC= +endif + + +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lrex -ltearex +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +# override CXXNAMESUFFIX = _$(CXXNAME) + +# # Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +# export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) 
+###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP + +# PowerPC-specific CXX compiler flags (being reviewed) +ifeq ($(UNAME_P),ppc64le) + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for cppnone and cppsse4 + # Throughput references without the extra flags below: cppnone=1.41-1.42E6, cppsse4=2.15-2.19E6 +else + ###CXXFLAGS+= -flto # also on Intel this would increase throughputs by a factor 2 to 4... + ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) +endif + +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
+ifeq ($(UNAME_P),ppc64le) + GPUFLAGS+= $(XCOMPILERFLAG) -mno-float128 +endif + +#------------------------------------------------------------------------------- + +#=== Configure defaults for OMPFLAGS + +# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ###$(info USEOPENMP==1: will build with OpenMP if possible) + ifneq ($(findstring hipcc,$(GPUCC)),) + override OMPFLAGS = # disable OpenMP MT when using hipcc #802 + else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) + else ifneq ($(shell $(CXX) --version | egrep '^clang version 16'),) + ###override OMPFLAGS = # disable OpenMP on clang16 #904 + $(error OpenMP is not supported by cudacpp on clang16 - issue #904) + else ifneq ($(shell $(CXX) --version | egrep '^clang version 17'),) + ###override OMPFLAGS = # disable OpenMP on clang17 #904 + $(error OpenMP is not supported by cudacpp on clang17 - issue #904) + else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) + override OMPFLAGS = -fopenmp + ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) + ###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) + else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) + override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) + ###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) + else + override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms + ###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) + endif +else + ###$(info USEOPENMP!=1: will build without OpenMP) + override OMPFLAGS = +endif + +#------------------------------------------------------------------------------- + +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else ifneq ($(RNDGEN),hasNoCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) + endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + ifneq ($(CUDA_HOME),) + # By default, assume that curand is installed if a CUDA installation exists + override HASCURAND = hasCurand + else + override HASCURAND = hasNoCurand + endif + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...)
+ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD + +# Set the build flags appropriate to OMPFLAGS +$(info OMPFLAGS=$(OMPFLAGS)) +CXXFLAGS += $(OMPFLAGS) + +# Set the build flags appropriate to each BACKEND choice (example: "make BACKEND=cppnone") +# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] +# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] +ifeq ($(UNAME_P),ppc64le) + ifeq ($(BACKEND),cppsse4) + override AVXFLAGS = -D__SSE4_2__ # Power9 VSX with 128 width (VSR registers) + else ifeq ($(BACKEND),cppavx2) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) + else ifeq ($(BACKEND),cpp512y) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) + else ifeq ($(BACKEND),cpp512z) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment) + endif +else ifeq ($(UNAME_P),arm) + ifeq ($(BACKEND),cppsse4) + override AVXFLAGS = -D__SSE4_2__ # ARM NEON with 128 width (Q/quadword registers) + else ifeq ($(BACKEND),cppavx2) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) + else ifeq ($(BACKEND),cpp512y) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) + else ifeq ($(BACKEND),cpp512z) + $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment) + endif +else 
ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 + ifeq ($(BACKEND),cppnone) + override AVXFLAGS = -mno-sse3 # no SIMD + else ifeq ($(BACKEND),cppsse4) + override AVXFLAGS = -mno-avx # SSE4.2 with 128 width (xmm registers) + else ifeq ($(BACKEND),cppavx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(BACKEND),cpp512y) + override AVXFLAGS = -march=skylake -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(BACKEND),cpp512z) + override AVXFLAGS = -march=skylake -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + endif +else + ifeq ($(BACKEND),cppnone) + override AVXFLAGS = -march=x86-64 # no SIMD (see #588) + else ifeq ($(BACKEND),cppsse4) + override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers) + else ifeq ($(BACKEND),cppavx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(BACKEND),cpp512y) + override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(BACKEND),cpp512z) + override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + endif +endif +# For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
+ifeq ($(GPUCC),) + CXXFLAGS+= $(AVXFLAGS) +endif + +# Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") +$(info FPTYPE='$(FPTYPE)') +ifeq ($(FPTYPE),d) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE +else ifeq ($(FPTYPE),f) + CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT +else ifeq ($(FPTYPE),m) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT +else + $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) +endif + +# Set the build flags appropriate to each HELINL choice (example: "make HELINL=1") +$(info HELINL='$(HELINL)') +ifeq ($(HELINL),1) + CXXFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS +else ifneq ($(HELINL),0) + $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1") +$(info HRDCOD='$(HRDCOD)') +ifeq ($(HRDCOD),1) + CXXFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM +else ifneq ($(HRDCOD),0) + $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) +endif + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq ($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+else + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) +endif + +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) + +#------------------------------------------------------------------------------- + +#=== Configure Position-Independent Code +CXXFLAGS += -fPIC +GPUFLAGS += $(XCOMPILERFLAG) -fPIC + +#------------------------------------------------------------------------------- + +#=== Configure build directories and build lockfiles === + +# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) +# (Rationale: avoid mixing of builds with different random number generators) +override TAG = $(patsubst cpp%%,%%,$(BACKEND))_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) + +# Export TAG (so that there is no need to check/define it again in cudacpp_src.mk) +export TAG + +# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 +override BUILDDIR = $(CUDACPP_BUILDDIR) +ifeq ($(USEBUILDDIR),1) + override LIBDIR = ../../lib/$(BUILDDIR) + override LIBDIRRPATH = '$$ORIGIN/../$(LIBDIR)' + $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR == 1)) +else + override LIBDIR = ../../lib + override LIBDIRRPATH = '$$ORIGIN/$(LIBDIR)' + $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR != 1)) +endif +###override INCDIR = ../../include +###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) + +# On Linux, set rpath to LIBDIR to make it unnecessary 
to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables or shared libraries ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override CXXLIBFLAGSRPATH = + override GPULIBFLAGSRPATH = + override CXXLIBFLAGSRPATH2 = + override GPULIBFLAGSRPATH2 = +else + # RPATH to gpu/cpp libs when linking executables + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override GPULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + # RPATH to common lib when linking gpu/cpp libs + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override GPULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' +endif + +# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) +override RUNTIME = + +#=============================================================================== +#=== Makefile TARGETS and build rules below +#=============================================================================== + + +ifeq ($(GPUCC),) + cxx_rwgtlib=$(BUILDDIR)/librwgt_cpp.so +else + gpu_rwgtlib=$(BUILDDIR)/librwgt_$(GPUSUFFIX).so +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all.$(TAG) + +# First target (default goal) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(cxx_rwgtlib) +else +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(gpu_rwgtlib) +endif + +# Target (and build options): debug +MAKEDEBUG= +debug: OPTFLAGS = -g -O0 +debug: CUDA_OPTFLAGS = -G +debug: MAKEDEBUG := debug +debug: all.$(TAG) + +# Target: tag-specific build lockfiles +override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` +$(BUILDDIR)/.build.$(TAG): + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi + @touch $(BUILDDIR)/.build.$(TAG) + +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) +# Added edgecase for HIP compilation +ifeq ($(shell $(CXX) --version | grep ^nvc++),) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +endif + +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +$(BUILDDIR)/rwgt_runner_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +$(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +$(BUILDDIR)/rwgt_runner_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) + +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/rwgt_runner_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/rwgt_runner_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) +endif +ifeq 
($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) +endif + +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins +endif +endif + +# Generic target and build rules: objects from C++ compilation +# (NB do not include CUDA_INC here! add it only for NVTX or curand #679) +$(BUILDDIR)/%%_cpp.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ + +# Generic target and build rules: objects from CUDA or HIP compilation +ifneq ($(GPUCC),) +$(BUILDDIR)/%%_$(GPUSUFFIX).o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ +endif + +#------------------------------------------------------------------------------- + +# Target (and build rules): common (src) library +commonlib : $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../../src/*.h ../../src/*.cc $(BUILDDIR)/.build.$(TAG) + $(MAKE) -C ../../src $(MAKEDEBUG) -f $(CUDACPP_SRC_MAKEFILE) + +#------------------------------------------------------------------------------- + +processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') +###$(info processid_short=$(processid_short)) + +MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o + +ifneq ($(GPUCC),) +MG5AMC_GPULIB 
= mg5amc_$(processid_short)_$(GPUSUFFIX) +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o +endif + +# Target (and build rules): C++ and CUDA/HIP shared libraries +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) + $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + +ifneq ($(GPUCC),) +$(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o +$(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o +$(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif +endif + +#------------------------------------------------------------------------------- + +# Target (and build rules): C++ rwgt libraries +# ZW: the -Bsymbolic flag ensures that function calls will be handled internally by the library, rather than going to global context +cxx_rwgtfiles := $(BUILDDIR)/rwgt_runner_cpp.o $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(BUILDDIR)/fbridge_cpp.o $(cxx_objects_lib) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o 
$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o +$(cxx_rwgtlib): LIBFLAGS += $(CXXLIBFLAGSRPATH) +$(cxx_rwgtlib): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_rwgtfiles) $(cxx_objects_lib) + $(CXX) -shared -Wl,-Bsymbolic -o $@ $(BUILDDIR)/rwgt_runner_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) $(BUILDDIR)/fbridge_cpp.o $(cxx_objects_lib) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) + +ifneq ($(GPUCC),) +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +endif +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(gpu_objects_lib) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(gpu_rwgtlib): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_rwgtlib): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 +$(gpu_rwgtlib): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +endif
+$(gpu_rwgtlib): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +gpu_rwgtfiles := $(BUILDDIR)/rwgt_runner_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_rwgtlib): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_rwgtfiles) $(gpu_objects_lib) + $(GPUCC) -shared -Xcompiler \"-Wl,-Bsymbolic\" -o $@ $(BUILDDIR)/rwgt_runner_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(gpu_objects_exe) $(gpu_objects_lib) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) +endif + +#------------------------------------------------------------------------------- + +# Target (and build rules): test objects and test executable +ifeq ($(GPUCC),) +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions +else +$(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +endif + +ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) +ifeq ($(GPUCC),) +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests +else 
+$(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +endif +endif + +ifeq ($(GPUCC),) +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o +else +$(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) +$(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 +$(gpu_testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +endif +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +endif + +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif + +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in 
https://stackoverflow.com/questions/45909648) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +else +$(cxx_testmain): LIBFLAGS += -lgomp +endif +endif +endif + +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif + +# Bypass std::filesystem completely to ease portability on LUMI #803 +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif + +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda +endif +endif + +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) +endif + +# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not 
yet been downloaded https://stackoverflow.com/a/32666215 +$(GTESTLIBS): +ifneq ($(shell which flock 2>/dev/null),) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) +else + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi +endif + +#------------------------------------------------------------------------------- + +# Target: build all targets in all BACKEND modes (each BACKEND mode in a separate build directory) +# Split the bldall target into separate targets to allow parallel 'make -j bldall' builds +# (Obsolete hack, no longer needed as there is no INCDIR: add a fbridge.inc dependency to bldall, to ensure it is only copied once for all BACKEND modes) +bldcuda: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda -f $(CUDACPP_MAKEFILE) + +bldhip: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip -f $(CUDACPP_MAKEFILE) + +bldnone: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone -f $(CUDACPP_MAKEFILE) + +bldsse4: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 -f $(CUDACPP_MAKEFILE) + +bldavx2: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 -f $(CUDACPP_MAKEFILE) + +bld512y: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y -f $(CUDACPP_MAKEFILE) + +bld512z: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) + +ifeq ($(UNAME_P),ppc64le) +###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 +bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) +###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 +bldavxs: bldnone bldsse4 +else +###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavx2 bld512y bld512z +bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(HIP_HOME),) +ifneq ($(CUDA_HOME),) +bldall: bldhip bldcuda bldavxs +else +bldall: bldhip bldavxs +endif +else +ifneq ($(CUDA_HOME),) +bldall: bldcuda bldavxs +else +bldall: bldavxs +endif +endif + +#------------------------------------------------------------------------------- + +# Target: clean the 
builds +.PHONY: clean + +clean: +ifeq ($(USEBUILDDIR),1) + rm -rf $(BUILDDIR) +else + rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.so $(BUILDDIR)/*.exe + rm -f $(LIBDIR)/lib*.so +endif + $(MAKE) -C ../../src clean -f $(CUDACPP_SRC_MAKEFILE) +### rm -rf $(INCDIR) + +cleanall: + @echo + $(MAKE) USEBUILDDIR=0 clean -f $(CUDACPP_MAKEFILE) + @echo + $(MAKE) USEBUILDDIR=0 -C ../../src cleanall -f $(CUDACPP_SRC_MAKEFILE) + rm -rf build.* + +# Target: clean the builds as well as the gtest installation(s) +distclean: cleanall +ifneq ($(wildcard $(TESTDIRCOMMON)),) + $(MAKE) -C $(TESTDIRCOMMON) clean +endif + $(MAKE) -C $(TESTDIRLOCAL) clean + +#------------------------------------------------------------------------------- + +# Target: show system and compiler information +info: + @echo "" + @uname -spn # e.g. Linux nodename.cern.ch x86_64 +ifeq ($(UNAME_S),Darwin) + @sysctl -a | grep -i brand + @sysctl -a | grep machdep.cpu | grep features || true + @sysctl -a | grep hw.physicalcpu: + @sysctl -a | grep hw.logicalcpu: +else + @cat /proc/cpuinfo | grep "model name" | sort -u + @cat /proc/cpuinfo | grep "flags" | sort -u + @cat /proc/cpuinfo | grep "cpu cores" | sort -u + @cat /proc/cpuinfo | grep "physical id" | sort -u +endif + @echo "" +ifneq ($(shell which nvidia-smi 2>/dev/null),) + nvidia-smi -L + @echo "" +endif + @echo USECCACHE=$(USECCACHE) +ifeq ($(USECCACHE),1) + ccache --version | head -1 +endif + @echo "" + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version +endif + @echo "" + @echo CXX=$(CXX) +ifneq ($(shell $(CXX) --version | grep ^clang),) + @echo $(CXX) -v + @$(CXX) -v |& egrep -v '(Found|multilib)' + @readelf -p .comment `$(CXX) -print-libgcc-file-name` |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print "GCC toolchain:",$$5}' +else + $(CXX) --version +endif + @echo "" + @echo FC=$(FC) + $(FC) --version + +#------------------------------------------------------------------------------- + +# Target: 'make test' (execute 
runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck + +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) +runTest: all.$(TAG) +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif + +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) +runCheck: all.$(TAG) +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif + +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) +runFcheck: all.$(TAG) +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif + +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) +cmpFcheck: all.$(TAG) + @echo +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif + +# Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) +cuda-memcheck: all.$(TAG) + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 + +#------------------------------------------------------------------------------- diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/rex.mk b/PLUGIN/CUDACPP_OUTPUT/MadtRex/rex.mk new file mode 100644 index 0000000000..03f858c8fa --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/rex.mk @@ -0,0 +1,26 @@ +CXX = g++ +CXXFLAGS = -std=c++17 -Wall -Wextra -fPIC -O3
+LDFLAGS = -L. -lrex -Wl,-rpath=. + +REX_SRC = Rex.cc +REX_HDR = Rex.h +REX_OBJ = Rex.o +REX_TARGET = librex.so + +TEA_SRC = teaRex.cc +TEA_HDR = teaRex.h +TEA_OBJ = teaRex.o +TEA_TARGET = libtearex.so + +all: $(REX_TARGET) $(TEA_TARGET) + + +# Build shared library +$(REX_TARGET): $(REX_SRC) + $(CXX) $(CXXFLAGS) -shared -o $@ $^ + +$(TEA_TARGET): $(TEA_SRC) $(REX_TARGET) + $(CXX) $(CXXFLAGS) -shared -o $@ $(TEA_SRC) $(LDFLAGS) + +clean: + rm -f $(REX_TARGET) $(TEA_TARGET) $(REX_OBJ) $(TEA_OBJ) \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.cc b/PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.cc new file mode 100644 index 0000000000..24f2ea49ae --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.cc @@ -0,0 +1,939 @@ +/*** + * _ ______ + * | | | ___ \ + * | |_ ___ __ _| |_/ /_____ __ + * | __/ _ \/ _` | // _ \ \/ / + * | || __/ (_| | |\ \ __/> < + * \__\___|\__,_\_| \_\___/_/\_\ + * + ***/ +// +// *t*ensorial *e*vent *a*daption with *R*e*x* Version 1.0.0 +// teaRex is an extension to the Rex library for the generic reweighting of parton-level events. +// It provides a flexible framework for applying weight modifications to events based on user-defined criteria, +// using the underlying Rex formats to sort, extract, and rewrite event-level information, +// and extending it to allow for generic reweighting using any information stored in an LHE file as input for a +// user-provided reweighting function acting on REX::process objects, which are SoA (Structure of Arrays) +// objects for storing event information. Users can either provide the REX::process objects themselves, +// or use the flexible Rex sorting architecture to extract the necessary information from an LHE file. +// +// Copyright © 2023-2025 CERN, CERN Author Zenny Wettersten. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// All rights not expressly granted are reserved. 
+// + +#ifndef _TEAREX_CPP_ +#define _TEAREX_CPP_ + +#include "teaRex.h" + +namespace REX::tea +{ + + bool true_function() + { + return true; + } + + void rwgt_slha::rwgt_card::add_param(const std::string &block_name, std::pair param) + { + if (this->blocks.find(block_name) == this->blocks.end()) + this->blocks[block_name] = rwgt_block{block_name, {}}; + this->blocks[block_name].params.push_back(param); + } + + void rwgt_slha::rwgt_card::add_param(const std::string &block_name, int param_id, double param_value) + { + this->add_param(block_name, std::make_pair(param_id, param_value)); + } + + void rwgt_slha::parse_rwgt_card(std::istream &is) + { + this->cards.clear(); + size_t curr_launch = 0; + this->cards.emplace_back(); + std::string line; + bool first_launch = false; + while (std::getline(is, line)) + { + if (line.empty() || line[0] == '#') + continue; // Skip empty lines and comments + std::string iss = line.c_str(); + if (iss.find("launch") != std::string::npos) + { + if (!first_launch) + { + first_launch = true; + } + else + { + this->cards.emplace_back(); + curr_launch++; + } + size_t name_pos = iss.find("rwgt_name"); + if (name_pos != std::string::npos) + { + std::string curr_name = iss.substr(name_pos + 10); + auto name_lines = REX::blank_splitter(curr_name); + this->cards.back().launch_name = std::string(name_lines[0]); + } + continue; + } + if (iss.find("set") != std::string::npos) + { + auto rwgt_line = REX::blank_splitter(iss); + if (rwgt_line[0] != "set") + continue; + if (rwgt_line.size() < 3) + continue; + if (rwgt_line.size() == 3) + throw std::runtime_error("rwgt_slha::parse_rwgt_card: \"set\" line appears to use parameter names. This is not supported by teaRex."); + if (!first_launch) + { + REX::warning("rwgt_slha::parse_rwgt_card: \"set\" line appears before first launch command. Assuming launch command missed. 
May end up appending meaningless reweighting iterations."); + first_launch = true; + } + size_t curr_ind_of_line = 1; + if (rwgt_line[curr_ind_of_line] == "param_card") + ++curr_ind_of_line; + std::string curr_block = std::string(rwgt_line[curr_ind_of_line]); + ++curr_ind_of_line; + int curr_param = REX::ctoi(rwgt_line[curr_ind_of_line]); + ++curr_ind_of_line; + double curr_val = REX::ctod(rwgt_line[curr_ind_of_line]); + this->cards[curr_launch].add_param(curr_block, curr_param, curr_val); + this->cards[curr_launch].rwgt_com += line + "\n"; + continue; + } + } + } + + rwgt_slha &rwgt_slha::set_card_path(const std::string &path) + { + this->card_path = path; + return *this; + } + + rwgt_slha &rwgt_slha::set_mod_card_path(const std::string &path) + { + this->mod_card_path = path; + return *this; + } + + bool rwgt_slha::move_param_card(const std::string &new_path) + { + if (this->card_path.empty()) + throw std::runtime_error("rwgt_slha::move_param_card: card_path not set"); + if (new_path.empty() && this->mod_card_path.empty()) + { + this->mod_card_path = this->card_path + ".mod"; + } + else if (!new_path.empty()) + { + this->mod_card_path = new_path; + } + try + { + std::filesystem::rename(this->card_path, this->mod_card_path); + } + catch (const std::filesystem::filesystem_error &e) + { + throw std::runtime_error("rwgt_slha::move_param_card: failed to rename original card to .mod backup: " + std::string(e.what())); + } + return true; + } + + bool rwgt_slha::remove_mod_card() + { + if (this->card_path.empty()) + throw std::runtime_error("rwgt_slha::remove_mod_card: card_path not set"); + if (this->mod_card_path.empty()) + throw std::runtime_error("rwgt_slha::remove_mod_card: mod_card_path not set"); + try + { + std::filesystem::remove(this->card_path); + } + catch (const std::filesystem::filesystem_error &e) + { + throw std::runtime_error("rwgt_slha::remove_mod_card: failed to remove mod card: " + std::string(e.what())); + } + try + { + 
std::filesystem::rename(this->mod_card_path, this->card_path); + } + catch (const std::filesystem::filesystem_error &e) + { + throw std::runtime_error("rwgt_slha::remove_mod_card: failed to rename .mod backup to original card: " + std::string(e.what())); + } + return true; + } + + bool rwgt_slha::write_rwgt_card(size_t idx) + { + if (idx >= this->cards.size()) + throw std::out_of_range("rwgt_slha::write_rwgt_card: index out of range"); + if (this->card_path.empty()) + throw std::runtime_error("rwgt_slha::write_rwgt_card: card_path not set"); + if (this->mod_card_path.empty()) + this->mod_card_path = this->card_path + ".mod"; + if (!std::filesystem::exists(this->mod_card_path)) + { + this->move_param_card(); + } + if (!this->orig_params.blocks.empty()) + { + for (const auto &[key, val] : this->orig_params.blocks) + { + if (REX::to_upper(key) == "DECAY") + { + for (const auto &[pid, width] : val.params) + { + this->REX::slha::set_decay(pid, width); + } + } + else + { + for (auto [param_id, param_value] : val.params) + { + this->REX::slha::set(key, param_id, param_value); + } + } + } + this->orig_params.blocks.clear(); + } + for (const auto &[key, val] : this->cards[idx].blocks) + { + if (REX::to_upper(key) == "DECAY") + { + for (const auto &[pid, width] : val.params) + { + auto curr_decay = this->REX::slha::get_decay(pid); + this->orig_params.add_param("DECAY", pid, curr_decay); + this->REX::slha::set_decay(pid, width); + } + } + else + { + for (auto [param_id, param_value] : val.params) + { + auto curr_param = this->REX::slha::get(key, param_id); + this->orig_params.add_param(key, param_id, curr_param); + this->REX::slha::set(key, param_id, param_value); + } + } + } + std::ofstream ofs(this->card_path); + if (!ofs) + throw std::runtime_error("rwgt_slha::write_rwgt_card: failed to open file"); + this->REX::slha::write(ofs); + return true; + } + + std::vector> rwgt_slha::get_card_writers() + { + std::vector> writers; + for (size_t i = 0; i < this->cards.size(); ++i) + 
{ + writers.push_back([this, i]() + { return this->write_rwgt_card(i); }); + } + return writers; + } + + std::vector rwgt_slha::get_launch_names() + { + std::vector names; + for (const auto &card : this->cards) + { + names.push_back(card.launch_name); + } + return names; + } + + std::vector rwgt_slha::get_rwgt_commands() + { + std::vector commands; + for (const auto &card : this->cards) + { + commands.push_back(card.rwgt_com); + } + return commands; + } + + threadPool::threadPool(unsigned nthreads) + : stop_(false), active_(0) + { + workers_.reserve(nthreads); + for (unsigned i = 0; i < nthreads; ++i) + { + workers_.emplace_back([this] + { + for (;;) { + Task task; + { + std::unique_lock lk(m_); + cv_.wait(lk, [this]{ return stop_ || !q_.empty(); }); + if (stop_ && q_.empty()) return; + task = std::move(q_.front()); + q_.pop(); + ++active_; + } + try { + task(); + } catch (...) { + // Record first exception and signal cancellation + { + std::lock_guard g(err_m_); + if (!first_error_) first_error_ = std::current_exception(); + } + cancel_.store(true, std::memory_order_relaxed); + } + { + std::lock_guard lk(m_); + --active_; + if (q_.empty() && active_ == 0) drained_.notify_all(); + } + } }); + } + } + + threadPool::~threadPool() + { + { + std::lock_guard lk(m_); + stop_ = true; + } + cv_.notify_all(); + for (auto &t : workers_) + t.join(); + } + + void threadPool::enqueue(Task t) + { + { + std::lock_guard lk(m_); + q_.push(std::move(t)); + } + cv_.notify_one(); + } + + void threadPool::begin_batch() + { + cancel_.store(false, std::memory_order_relaxed); + std::lock_guard g(err_m_); + first_error_ = nullptr; + } + + void threadPool::wait_batch() + { + std::unique_lock lk(m_); + drained_.wait(lk, [this] + { return (q_.empty() && active_ == 0) || first_error_; }); + lk.unlock(); + if (first_error_) + std::rethrow_exception(first_error_); + } + + bool threadPool::cancel_requested() const noexcept + { + return cancel_.load(std::memory_order_relaxed); + } + + 
procReweightor::procReweightor(weightor reweight_function) + { + this->reweight_functions.push_back(reweight_function); + } + + procReweightor::procReweightor(weightor reweight_function, eventBelongs selector) + { + this->reweight_functions.push_back(reweight_function); + this->event_checker = std::make_shared(std::move(selector)); + this->event_checker_fn = this->event_checker->get_event_bool(); + } + + procReweightor::procReweightor(weightor reweight_function, std::shared_ptr selector) + { + this->reweight_functions.push_back(reweight_function); + this->event_checker = selector; + this->event_checker_fn = selector->get_event_bool(); + } + + procReweightor::procReweightor(std::vector rwgts) + { + this->reweight_functions = rwgts; + } + + procReweightor::procReweightor(std::vector rwgts, std::shared_ptr selector) + { + this->reweight_functions = rwgts; + this->event_checker = selector; + this->event_checker_fn = selector->get_event_bool(); + } + + procReweightor::procReweightor(std::vector rwgts, eventBelongs selector) + { + this->reweight_functions = rwgts; + this->event_checker = std::make_shared(std::move(selector)); + this->event_checker_fn = this->event_checker->get_event_bool(); + } + + procReweightor::procReweightor(std::vector rwgts, eventBelongs selector, weightor normaliser) + { + this->reweight_functions = rwgts; + this->event_checker = std::make_shared(std::move(selector)); + this->event_checker_fn = this->event_checker->get_event_bool(); + this->normaliser = normaliser; + } + + procReweightor &procReweightor::set_event_checker(eventBelongs checker) + { + this->event_checker = std::make_shared(std::move(checker)); + this->event_checker_fn = this->event_checker->get_event_bool(); + return *this; + } + + procReweightor &procReweightor::set_event_checker(REX::event_bool_fn checker) + { + this->event_checker = nullptr; + this->event_checker_fn = checker; + return *this; + } + + procReweightor &procReweightor::set_normaliser(weightor normaliser) + { + 
this->normaliser = normaliser; + return *this; + } + + procReweightor &procReweightor::set_reweight_functions(weightor rwgt) + { + this->reweight_functions = {rwgt}; + if (!this->normaliser) + this->normaliser = rwgt; + return *this; + } + + procReweightor &procReweightor::set_reweight_functions(std::vector rwgts) + { + this->reweight_functions = rwgts; + return *this; + } + + procReweightor &procReweightor::add_reweight_function(weightor rwgt) + { + this->reweight_functions.push_back(rwgt); + return *this; + } + + procReweightor &procReweightor::set_process(std::shared_ptr p) + { + this->proc = p; + return *this; + } + + // Member functions for handling reweighting + void procReweightor::initialise() + { + if (!this->proc) + throw std::runtime_error("procReweightor::initialise: process not set before initialisation"); + if (!this->normaliser) + { + if (this->reweight_functions.empty()) + { + warning("procReweightor::initialise: no reweight functions set, process will only yield zero weights."); + this->normalisation = std::vector(this->proc->weight_.size(), 0.0); + return; + } + if (this->reweight_functions.size() != 1) + warning("procReweightor::initialise: multiple reweight functions set, assuming first is default evaluator and using it for normalisation."); + this->normaliser = this->reweight_functions[0]; + } + auto normalised = this->normaliser(*this->proc); + if (!normalised) + throw std::runtime_error("procReweightor::initialise: normaliser function returned null pointer"); + if (normalised->size() != this->proc->weight_.size()) + throw std::runtime_error("procReweightor::initialise: normalisation vector size does not match number of original weights in process"); + this->normalisation = *normalised; + std::transform(this->normalisation.begin(), this->normalisation.end(), this->normalisation.begin(), + [](double val) + { return (val == 0.0) ? 
0.0 : 1.0 / val; }); + this->normalisation = *REX::vec_elem_mult(this->normalisation, this->proc->weight_); + } + + void procReweightor::initialise(std::shared_ptr p) + { + this->proc = p; + initialise(); + } + + void procReweightor::evaluate() + { + return this->evaluate(0); + } + + void procReweightor::evaluate(size_t amp) + { + if (!this->proc) + throw std::runtime_error("procReweightor::evaluate: process not set before evaluation"); + if (this->reweight_functions.size() <= amp) + return this->append_zero_weights(); + if (this->normalisation.empty()) + this->initialise(); + auto newweights = this->reweight_functions[amp](*this->proc); + if (!newweights) + throw std::runtime_error("procReweightor::evaluate: reweight function returned null pointer"); + this->backlog.push_back(std::move(*newweights)); + } + + void procReweightor::append_zero_weights() + { + if (!this->proc) + throw std::runtime_error("procReweightor::append_zero_weights: process not set before appending zero weights"); + this->backlog.push_back(std::vector(this->proc->weight_.size(), 0.0)); + } + + void procReweightor::append_backlog() + { + if (this->normalisation.empty()) + throw std::runtime_error("procReweightor::append_backlog: normalisation is empty; call initialise() first"); + + for (auto &weights : this->backlog) + { + if (weights.size() != this->normalisation.size()) + throw std::runtime_error("procReweightor::append_backlog: size mismatch between weights and normalisation"); + + this->proc->append_wgts(*REX::vec_elem_mult(weights, this->normalisation)); + } + this->backlog.clear(); + } + + reweightor::reweightor(lhe &&mother) : lhe(std::move(mother)) {} + + reweightor::reweightor(const lhe &mother) : lhe(mother) {} + + reweightor::reweightor(lhe &&mother, std::vector> rws) : lhe(std::move(mother)), reweightors(rws) {} + + reweightor::reweightor(const lhe &mother, std::vector> rws) : lhe(mother), reweightors(rws) {} + + reweightor::reweightor(lhe &&mother, std::vector> rws, std::vector 
iters) : lhe(std::move(mother)), reweightors(rws), iterators(std::move(iters)) {} + + reweightor::reweightor(const lhe &mother, std::vector> rws, std::vector iters) : lhe(mother), reweightors(rws), iterators(std::move(iters)) {} + + reweightor::reweightor(lhe &&mother, std::vector rws) : lhe(std::move(mother)) + { + this->set_reweightors(rws); + } + + reweightor::reweightor(const lhe &mother, std::vector rws) : lhe(mother) + { + this->set_reweightors(rws); + } + + reweightor::reweightor(lhe &&mother, std::vector rws, std::vector iters) : lhe(std::move(mother)), iterators(std::move(iters)) + { + this->set_reweightors(rws); + } + + reweightor::reweightor(const lhe &mother, std::vector rws, std::vector iters) : lhe(mother), iterators(std::move(iters)) + { + this->set_reweightors(rws); + } + + reweightor &reweightor::set_reweightors(std::vector> rws) + { + this->reweightors = rws; + return *this; + } + + reweightor &reweightor::set_reweightors(std::vector rws) + { + this->reweightors.clear(); + for (auto &rw : rws) + { + this->reweightors.push_back(std::make_shared(std::move(rw))); + } + return *this; + } + + reweightor &reweightor::add_reweightor(procReweightor &rw) + { + this->reweightors.push_back(std::make_shared(rw)); + return *this; + } + + reweightor &reweightor::add_reweightor(procReweightor &&rw) + { + this->reweightors.push_back(std::make_shared(std::move(rw))); + return *this; + } + + reweightor &reweightor::add_reweightor(std::shared_ptr rw) + { + this->reweightors.push_back(rw); + return *this; + } + + reweightor &reweightor::set_initialise(iterator init) + { + this->initialise = init; + return *this; + } + + reweightor &reweightor::set_finalise(iterator fin) + { + this->finalise = fin; + return *this; + } + + reweightor &reweightor::set_iterators(const std::vector &iters) + { + this->iterators = iters; + return *this; + } + + reweightor &reweightor::add_iterator(const iterator &iter) + { + this->iterators.push_back(iter); + return *this; + } + + 
reweightor &reweightor::add_iterator(iterator &&iter) + { + this->iterators.push_back(std::move(iter)); + return *this; + } + + reweightor &reweightor::set_launch_names(const std::vector &names) + { + this->launch_names = names; + return *this; + } + + reweightor &reweightor::add_launch_name(const std::string &name) + { + this->launch_names.push_back(name); + return *this; + } + + void reweightor::calc_norm() + { + if (this->events.empty()) + throw std::runtime_error("reweightor::calc_norm: no events loaded, cannot calculate norm"); + this->norm_factor = 1.0; + if (std::abs(this->idWgt_) == 3) + { + this->norm_factor = std::accumulate(this->xSec_.begin(), this->xSec_.end(), 0.0); + this->norm_factor /= this->events.size(); + } + else if (std::abs(this->idWgt_) == 4) + { + this->norm_factor = 1. / this->events.size(); + } + else + { + if (std::abs(this->idWgt_) > 2 || this->idWgt_ == 0) + warning("reweightor::calc_norm: idWgt is not set to a value defined in the LHE standard. Assuming weighted events."); + this->norm_factor = std::accumulate(this->xSec_.begin(), this->xSec_.end(), 0.0); + double accumulated_wgts = 0.0; + for (const auto &proc : this->processes) + { + accumulated_wgts += std::accumulate(proc->weight_.begin(), proc->weight_.end(), 0.0); + } + if (accumulated_wgts == 0.0) + { + for (auto ev : this->events) + { + accumulated_wgts += ev->weight_; + } + } + if (accumulated_wgts == 0.0) + throw std::runtime_error("reweightor::calc_norm: total weight is zero, cannot calculate norm"); + this->norm_factor /= accumulated_wgts; + } + } + + void reweightor::set_norm(double norm) + { + this->norm_factor = norm; + } + + void reweightor::setup_pool() + { + if (!pool) + { + unsigned hc = std::thread::hardware_concurrency(); + if (hc == 0) + hc = 1; + unsigned want = this->pool_threads ? 
this->pool_threads : hc; + unsigned n = static_cast( + std::max(1, std::min(reweightors.size(), want))); + this->pool = std::make_unique(n); + } + } + + void reweightor::extract_sorter() + { + if (this->reweightors.empty()) + throw std::runtime_error("reweightor::extract_sorter: no procReweightors set in reweightor"); + + std::vector preds; + preds.reserve(this->reweightors.size()); + for (const auto &rw : this->reweightors) + { + if (rw->event_checker_fn) + { + preds.push_back(rw->event_checker_fn); + } + else + { + preds.push_back(rw->event_checker->get_event_bool()); + } + } + + this->set_sorter(eventSorter(std::move(preds))); + this->sorted_events.clear(); + this->processes.clear(); + this->sort_events(); + this->events_to_processes(); + + const size_t R = this->reweightors.size(); + const size_t B = this->processes.size(); + const bool has_unsorted = (B == R + 1); + auto processes_full = this->processes; + + std::vector keep; + keep.reserve(R); + for (size_t i = 0; i < R; ++i) + if (!processes_full[i]->events.empty()) + keep.push_back(i); + + std::vector> procs; + std::vector> rwgs; + procs.reserve(keep.size() + (has_unsorted ? 1u : 0u)); + rwgs.reserve(keep.size() + (has_unsorted ? 
1u : 0u)); + + for (size_t i : keep) + { + procs.push_back(processes_full[i]); + rwgs.push_back(this->reweightors[i]); + } + + if (has_unsorted && !processes_full.back()->events.empty()) + { + procs.push_back(processes_full.back()); + + auto dummy = std::make_shared(); + rwgs.push_back(std::move(dummy)); + } + + this->processes = std::move(procs); + this->reweightors = std::move(rwgs); + + if (this->processes.size() != this->reweightors.size()) + throw std::runtime_error("reweightor::extract_sorter: number of processes does not match number of reweightors."); + + for (size_t i = 0; i < this->reweightors.size(); ++i) + { + auto &p = this->processes[i]; + p->validate(); + this->reweightors[i]->set_process(p); + } + } + + void reweightor::initialise_reweightors() + { + if (this->reweightors.size() != this->processes.size()) + throw std::runtime_error("initialise_reweightors: reweightors/processes size mismatch"); + + for (size_t i = 0; i < this->reweightors.size(); ++i) + this->reweightors[i]->initialise(this->processes[i]); + } + + void reweightor::finalise_reweighting() + { + for (auto proc : this->processes) + { + proc->transpose_wgts(); + proc->validate(); + } + if (!this->finalise()) + warning("reweightor::finalise_reweighting: finalise iterator returned false, something might have gone wrong. Validate output manually."); + if (this->launch_names.size() > 0) + { + this->extract_weight_ids(); + size_t nWgts = this->weight_ids->size(); + for (size_t i = 0; i < this->launch_names.size(); ++i) + { + std::string curr_name = (this->launch_names[i].empty()) ? 
"rwgt_" + std::to_string(i + nWgts + 1) : this->launch_names[i]; + this->weight_ids->push_back(curr_name); + } + } + this->calc_xSecs(); + this->calc_xErrs(); + } + + void reweightor::setup() + { + if (!this->initialise()) + throw std::runtime_error("reweightor::setup: initialise iterator returned false, something went wrong."); + this->extract_sorter(); + this->initialise_reweightors(); + this->n_amps = 0; + for (auto &rwgt : this->reweightors) + { + size_t amps = rwgt->reweight_functions.size(); + this->n_amps = std::max(this->n_amps, amps); + } + + if (this->n_amps == 0) + { + throw std::runtime_error("reweightor::setup: no reweight functions found, something went wrong."); + } + this->setup_pool(); + } + + void reweightor::run_iteration() + { + if (!this->iterators[this->curr_iter]()) + throw std::runtime_error("reweightor::run_iteration: iterator returned false, something went wrong in iteration " + std::to_string(this->curr_iter) + "."); + this->curr_iter++; + + // Nothing to do? + const size_t N = this->reweightors.size(); + if (N == 0 || this->n_amps == 0) + return; + + // Ensure pool exists (persisted across iterations) + setup_pool(); + + // Parallel "reweightor" phase + pool->begin_batch(); + for (size_t i = 0; i < N; ++i) + { + pool->enqueue([this, i] + { + // Early cancel check (best-effort) + if (pool->cancel_requested()) return; + + auto &rwgt = this->reweightors[i]; + for (size_t amp = 0; amp < this->n_amps; ++amp) { + // Safe because each task owns a distinct rwgt + rwgt->evaluate(amp); + if (pool->cancel_requested()) return; // responsive cancellation + } }); + } + // Wait for completion (or rethrow first error) + pool->wait_batch(); + } + + void reweightor::run_all_iterations() + { + while (this->curr_iter < this->iterators.size()) + { + this->run_iteration(); + for (auto &rwgt : this->reweightors) + { + rwgt->append_backlog(); + } +#pragma optimize("", off) + std::cout << "."; +#pragma optimize("", on) + std::cout.flush(); + } + } + + void 
reweightor::run() + { + this->setup(); + this->run_all_iterations(); + this->finalise_reweighting(); + } + + void reweightor::calc_xSecs() + { + if (this->norm_factor == 0.0) + this->calc_norm(); + this->rwgt_xSec = std::vector(this->events[0]->wgts_.size(), 0.0); + for (auto ev : this->events) + { + for (size_t i = 0; i < ev->wgts_.size(); ++i) + { + this->rwgt_xSec[i] += ev->wgts_[i]; + } + } + for (auto &x : this->rwgt_xSec) + x *= this->norm_factor; + } + + void reweightor::calc_xErrs() + { + if (this->rwgt_xSec.size() == 0) + this->calc_xSecs(); + double loc_xSec = std::accumulate(this->xSec_.begin(), this->xSec_.end(), 0.0); + double loc_xErr = std::sqrt(std::accumulate(this->xSecErr_.begin(), this->xSecErr_.end(), 0.0, [](double a, double b) + { return a + b * b; })); + size_t nEvs = this->events.size(); + if (nEvs == 0) + { + this->transpose(); + nEvs = this->events.size(); + if (nEvs == 0) + throw std::runtime_error("reweightor::calc_xErrs: no events found, cannot calculate errors"); + } + this->rwgt_xErr = std::vector(this->rwgt_xSec.size(), 0.0); + auto omg = std::vector(this->rwgt_xSec.size(), 0.0); + auto omgSq = std::vector(this->rwgt_xSec.size(), 0.0); + for (auto ev : this->events) + { + for (size_t k = 0; k < ev->wgts_.size(); ++k) + { + double ratio = ev->wgts_[k] / ev->weight_; + omg[k] += ratio; + omgSq[k] += std::pow(ratio, 2); + } + } + double invNoEvs = 1.0 / double(nEvs); + double sqrtInvNoEvs = std::sqrt(invNoEvs); + for (size_t k = 0; k < this->rwgt_xSec.size(); ++k) + { + double variance = (omgSq[k] - std::pow(omg[k], 2) * invNoEvs) * invNoEvs; + variance = std::max(variance, 0.0); + this->rwgt_xErr[k] = loc_xSec * std::sqrt(variance) * sqrtInvNoEvs + loc_xErr * omg[k] * invNoEvs; + if (std::isnan(this->rwgt_xErr[k]) || std::isinf(this->rwgt_xErr[k]) || this->rwgt_xErr[k] <= 0.0) + { + warning("reweightor::calc_xErrs: Error propagation failed for weight " + std::to_string(k) + ". 
Approximating the error at the level of the cross section."); + this->rwgt_xErr[k] = loc_xErr * std::max(loc_xSec / this->rwgt_xSec[k], this->rwgt_xSec[k] / loc_xSec); + } + } + } + + param_rwgt::param_rwgt(const lhe &mother, std::vector> rws, const std::string &slha_path, const std::string &rwgt_path) + : reweightor(mother, rws) + { + this->read_slha_rwgt(slha_path, rwgt_path); + } + + void param_rwgt::read_slha_rwgt(std::istream &slha_in, std::istream &rwgt_in) + { + this->card_iter = rwgt_slha::create(slha_in, rwgt_in); + this->initialise = [&]() + { return this->card_iter.move_param_card(); }; + this->finalise = [&]() + { return this->card_iter.remove_mod_card(); }; + this->iterators = this->card_iter.get_card_writers(); + this->launch_names = this->card_iter.get_launch_names(); + this->weight_context = this->card_iter.get_rwgt_commands(); + } + + void param_rwgt::read_slha_rwgt(const std::string &slha_file, const std::string &rwgt_file) + { + std::ifstream slha_in(slha_file); + std::ifstream rwgt_in(rwgt_file); + if (!slha_in || !rwgt_in) + throw std::runtime_error("param_rwgt::read_slha_rwgt: failed to open input files"); + this->read_slha_rwgt(slha_in, rwgt_in); + this->card_iter.set_card_path(slha_file); + } + +} // namespace REX::tea +#endif // _TEAREX_CPP_ \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.h b/PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.h new file mode 100644 index 0000000000..e467438413 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/teaRex.h @@ -0,0 +1,283 @@ +/*** + * _ ______ + * | | | ___ \ + * | |_ ___ __ _| |_/ /_____ __ + * | __/ _ \/ _` | // _ \ \/ / + * | || __/ (_| | |\ \ __/> < + * \__\___|\__,_\_| \_\___/_/\_\ + * + ***/ +// +// *t*ensorial *e*vent *a*daption with *R*e*x* Version 1.0.0 +// teaRex is an extension to the Rex library for the generic reweighting of parton-level events. 
+// It provides a flexible framework for applying weight modifications to events based on user-defined criteria, +// using the underlying Rex formats to sort, extract, and rewrite event-level information, +// and extending it to allow for generic reweighting using any information stored in an LHE file as input for a +// user-provided reweighting function acting on REX::process objects, which are SoA (Structure of Arrays) +// objects for storing event information. Users can either provide the REX::process objects themselves, +// or use the flexible Rex sorting architecture to extract the necessary information from an LHE file. +// +// Copyright © 2023-2025 CERN, CERN Author Zenny Wettersten. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// All rights not expressly granted are reserved. +// + +#ifndef _TEAREX_H_ +#define _TEAREX_H_ + +#include "Rex.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace REX::tea +{ + using eventBelongs = REX::eventBelongs; + using eventSorter = REX::eventSorter; + using lhe = REX::lhe; + using process = REX::process; + using slha = REX::slha; + using iterator = std::function; + using weightor = std::function>(process &)>; + + bool true_function(); + + // Type for handling SLHA card modifications using cards of the form + // # launch rwgt_name=OPTIONAL_NAME + // # set BLOCK_NAME PARAM_ID PARAM_VALUE + // # set BLOCK_NAME PARAM_ID PARAM_VALUE + // # + // # launch rwgt_name=OPTIONAL_NAME2...
+ struct rwgt_slha : public slha + { + // Default constructors + rwgt_slha() = default; + rwgt_slha(const rwgt_slha &) = default; + rwgt_slha(rwgt_slha &&) = default; + rwgt_slha &operator=(const rwgt_slha &) = default; + rwgt_slha &operator=(rwgt_slha &&) = default; + static rwgt_slha create(std::istream &slha_in, std::istream &rwgt_in) + { + rwgt_slha r; + r.read(slha_in); + r.parse_rwgt_card(rwgt_in); + return r; + } + + static rwgt_slha create(const std::string &slha_path, const std::string &rwgt_path) + { + rwgt_slha r; + std::ifstream slha_in(slha_path); + std::ifstream rwgt_in(rwgt_path); + if (!slha_in || !rwgt_in) + throw std::runtime_error("rwgt_slha::create: failed to open input files"); + r.card_path = slha_path; + r.read(slha_in); + r.parse_rwgt_card(rwgt_in); + return r; + } + + struct rwgt_block + { + std::string name; + std::vector> params; + }; + + struct rwgt_card + { + std::string launch_name; + std::string rwgt_com = ""; + std::unordered_map blocks; + void add_param(const std::string &block_name, std::pair param); + void add_param(const std::string &block_name, int param_id, double param_value); + }; + + std::vector cards = {}; + + std::string card_path; + std::string mod_card_path; + rwgt_slha &set_card_path(const std::string &path); + rwgt_slha &set_mod_card_path(const std::string &path); + + rwgt_card orig_params; + + bool move_param_card(const std::string &new_path = ""); + bool remove_mod_card(); + + void parse_rwgt_card(std::istream &is); + bool write_rwgt_card(size_t idx); + std::vector> get_card_writers(); + std::vector get_launch_names(); + std::vector get_rwgt_commands(); + }; + + class threadPool + { + public: + using Task = std::function; + + explicit threadPool(unsigned nthreads); + + ~threadPool(); + + void enqueue(Task t); + void begin_batch(); + void wait_batch(); + + bool cancel_requested() const noexcept; + + private: + std::vector workers_; + std::queue q_; + std::mutex m_; + std::condition_variable cv_; + + std::atomic 
cancel_{false}; + std::atomic stop_; + size_t active_; + std::condition_variable drained_; + + std::mutex err_m_; + std::exception_ptr first_error_ = nullptr; + }; + + struct procReweightor + { + // Default constructors + procReweightor() = default; + procReweightor(const procReweightor &) = default; + procReweightor(procReweightor &&) = default; + procReweightor &operator=(const procReweightor &) = default; + procReweightor &operator=(procReweightor &&) = default; + // Explicit constructors wrt reweighting + procReweightor(weightor reweight_function); + procReweightor(weightor reweight_function, eventBelongs selector); + procReweightor(weightor reweight_function, std::shared_ptr selector); + procReweightor(std::vector rwgts); + procReweightor(std::vector rwgts, eventBelongs selector); + procReweightor(std::vector rwgts, std::shared_ptr selector); + procReweightor(std::vector rwgts, eventBelongs selector, weightor normaliser); + procReweightor(std::vector rwgts, std::shared_ptr selector, weightor normaliser); + + std::shared_ptr event_checker; + REX::event_bool_fn event_checker_fn = nullptr; + weightor normaliser = nullptr; + std::vector reweight_functions = {}; + std::vector normalisation = {}; + std::shared_ptr proc = nullptr; + std::vector> backlog = {}; + + procReweightor &set_event_checker(eventBelongs checker); + procReweightor &set_event_checker(REX::event_bool_fn checker); + procReweightor &set_normaliser(weightor normaliser); + procReweightor &set_reweight_functions(weightor rwgt); + procReweightor &set_reweight_functions(std::vector rwgts); + procReweightor &add_reweight_function(weightor rwgt); + procReweightor &set_process(std::shared_ptr p); + + // Member functions for handling reweighting + void initialise(); + void initialise(std::shared_ptr p); + void evaluate(); + void evaluate(size_t amp); + void append_zero_weights(); + void append_backlog(); + }; + + // The reweightor object is an extension to REX::lhe + // with member functions for handling the 
details of reweighting + struct reweightor : public lhe + { + // Default constructors + reweightor() = default; + reweightor(const reweightor &) = default; + reweightor(reweightor &&) = default; + reweightor &operator=(const reweightor &) = default; + reweightor &operator=(reweightor &&) = default; + reweightor(lhe &&lhe); + reweightor(const lhe &lhe); + reweightor(lhe &&mother, std::vector> rws); + reweightor(const lhe &mother, std::vector> rws); + reweightor(lhe &&mother, std::vector> rws, std::vector iters); + reweightor(const lhe &mother, std::vector> rws, std::vector iters); + reweightor(lhe &&mother, std::vector rws); + reweightor(const lhe &mother, std::vector rws); + reweightor(lhe &&mother, std::vector rws, std::vector iters); + reweightor(const lhe &mother, std::vector rws, std::vector iters); + + std::vector> reweightors; + iterator initialise = true_function; + iterator finalise = true_function; + std::vector iterators = {true_function}; + std::vector launch_names = {}; + + size_t curr_iter = 0; + size_t n_amps = 0; + + double norm_factor = 0.0; + + void calc_norm(); + void set_norm(double norm); + + std::vector rwgt_xSec = {}; + std::vector rwgt_xErr = {}; + + std::unique_ptr pool; // persistent worker pool + unsigned long pool_threads = 0; + void setup_pool(); + + reweightor &set_reweightors(std::vector> rws); + reweightor &set_reweightors(std::vector rws); + reweightor &add_reweightor(std::shared_ptr rw); + reweightor &add_reweightor(procReweightor &rw); + reweightor &add_reweightor(procReweightor &&rw); + reweightor &set_initialise(iterator init); + reweightor &set_finalise(iterator fin); + reweightor &set_iterators(const std::vector &iters); + reweightor &add_iterator(const iterator &iter); + reweightor &add_iterator(iterator &&iter); + reweightor &set_launch_names(const std::vector &names); + reweightor &add_launch_name(const std::string &name); + + void extract_sorter(); + void initialise_reweightors(); + void finalise_reweighting(); + void 
setup(); + void run_iteration(); + void run_all_iterations(); + void run(); + + void calc_xSecs(); + void calc_xErrs(); + }; + + struct param_rwgt : public reweightor + { + param_rwgt() = default; + param_rwgt(const param_rwgt &) = default; + param_rwgt(param_rwgt &&) = default; + param_rwgt &operator=(const param_rwgt &) = default; + param_rwgt &operator=(param_rwgt &&) = default; + + param_rwgt(const lhe &mother) : reweightor(mother) {}; + param_rwgt(const lhe &mother, std::vector> rws) : reweightor(mother, rws) {}; + + param_rwgt(const lhe &mother, std::vector> rws, const std::string &slha_path, const std::string &rwgt_path); + + rwgt_slha card_iter; + + void read_slha_rwgt(std::istream &slha_in, std::istream &rwgt_in); + void read_slha_rwgt(const std::string &slha_file, const std::string &rwgt_file); + }; + +} // namespace REX::tea + +#endif // _TEAREX_H_ \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_driver.inc b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_driver.inc new file mode 100644 index 0000000000..2e16f1639e --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_driver.inc @@ -0,0 +1,200 @@ +//========================================================================== +// Copyright (C) 2023-2024 CERN +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Written by: Z. Wettersten (Jan 2024) for the MG5aMC CUDACPP plugin. 
+//========================================================================== +//========================================================================== +// This file has been automatically generated for C++ Standalone by +%(info_lines)s +//========================================================================== +//========================================================================== +// Driver for reweighting events for processes +%(multiprocess_lines)s +//-------------------------------------------------------------------------- + +#include "rwgt_instance.h" +#include +#include +#include +%(include_lines)s + + int usage(char *argv0, int ret = 1) +{ + std::cout << "Usage: " << argv0 + << " [--lhefile=\"/YOUR/PATH/HERE\"|-lhe=\"/YOUR/PATH/HERE\"] [--rwgtcard=/YOUR/PATH/HERE|-rwgt=\"/YOUR/PATH/HERE\"]\n" + << "[--output=/YOUR/PATH/HERE\"|-out=\"/YOUR/PATH/HERE\"]\n" + << "[--param_card=/YOUR/PATH/HERE\"|-slha=\"/YOUR/PATH/HERE\"]\n"; + std::cout << "\n"; + std::cout << "The LHE file path should be with respect to the directory you are running\n"; + std::cout << "this program from, and similarly the rwgt_card should be as well.\n"; + return ret; +} + +void writeRwgtCsv(std::string path, std::shared_ptr> names, std::shared_ptr> xSecs, std::shared_ptr> errXSecs) +{ + std::ofstream outFile; + outFile.open(path); + if (!outFile.is_open()) + throw std::runtime_error("Failed to open output file for writing."); + if (names->size() != xSecs->size() || names->size() != errXSecs->size()) + throw std::runtime_error("Mismatch in number of processes, cross-sections, and errors when logging results."); + for (size_t k = 0; k < names->size(); ++k) + { + outFile << names->at(k) << ", " << xSecs->at(k) << ", " << errXSecs->at(k) << "\n"; + } + outFile.close(); + return; +} + +void writeRwgtCsv(std::string path, std::vector names, std::vector xSecs, std::vector errXSecs) +{ + std::ofstream outFile; + outFile.open(path); + if (!outFile.is_open()) + throw 
std::runtime_error("Failed to open output file for writing."); + if (names.size() != xSecs.size() || names.size() != errXSecs.size()) + throw std::runtime_error("Mismatch in number of processes, cross-sections, and errors when logging results."); + for (size_t k = 0; k < names.size(); ++k) + { + outFile << names.at(k) << ", " << xSecs.at(k) << ", " << errXSecs.at(k) << "\n"; + } + outFile.close(); + return; +} + +int main(int argc, char **argv) +{ + + std::string banner = + "#################################################\n" + "# ___ ___ _ _ ______ #\n" + "# | \\/ | | | | | ___ \\ #\n" + "# | . . | __ _ __| | |_| |_/ /_____ __ #\n" + "# | |\\/| |/ _` |/ _` | __| // _ \\ \\/ / #\n" + "# | | | | (_| | (_| | |_| |\\ \\ __/> < #\n" + "# \\_| |_/\\__,_|\\__,_|\\__\\_| \\_\\___/_/\\_\\ #\n" + "# #\n" + "# Data-parallel event reweighting #\n" + "# in MadGraph5_aMC@NLO #\n" + "# #\n" + "#################################################\n"; + + std::cout << banner; + std::cout << "Starting MadtRex driver...\n"; + std::string lheFilePath; + std::string rwgtCardPath; + std::string outputPath; + std::string slhaPath; + size_t nb_threads = 1; + size_t batch_size = 32; + + if (argc < 2) + { + return usage(argv[0]); + } + + for (int i = 1; i < argc; i++) + { + auto currArg = std::string(argv[i]); + if (currArg.substr(0, 9) == "--lhefile" || currArg.substr(0, 4) == "-lhe") + { + lheFilePath = currArg.substr(currArg.find("=") + 1); + } + else if (currArg.substr(0, 10) == "--rwgtcard" || currArg.substr(0, 5) == "-rwgt") + { + rwgtCardPath = currArg.substr(currArg.find("=") + 1); + } + else if (currArg.substr(0, 8) == "--output" || currArg.substr(0, 4) == "-out") + { + outputPath = currArg.substr(currArg.find("=") + 1); + } + else if (currArg.substr(0, 12) == "--param_card" || currArg.substr(0, 5) == "-slha") + { + slhaPath = currArg.substr(currArg.find("=") + 1); + } + else if (currArg.substr(0, 10) == "--nb_threads" || currArg.substr(0, 3) == "-nt") + { + nb_threads = 
std::stoi(currArg.substr(currArg.find("=") + 1)); + } + else if (currArg.substr(0, 12) == "--batch_size" || currArg.substr(0, 5) == "-warp") + { + batch_size = std::stoi(currArg.substr(currArg.find("=") + 1)); + } + else + { + return usage(argv[0]); + } + } + + if (lheFilePath.empty() || rwgtCardPath.empty()) + { + return usage(argv[0]); + } + + std::string currPath = argv[0]; + + size_t slashPos = currPath.find_last_of("/"); + bool onWindows = false; + if (slashPos == std::string::npos) + { + slashPos = currPath.find_last_of("\\"); + onWindows = true; + } + if (slashPos == std::string::npos) + throw std::runtime_error("Failed to determine current working directory -- need to know where program is run from to identify where to pull and push param_card.dat."); + + if (slhaPath.empty()) + { + if (onWindows) + { + if (currPath.substr(currPath.find_last_of("\\", slashPos - 1) + 1, 2) == "P1") + { + slhaPath = "..\\..\\Cards\\param_card.dat"; + } + else if (currPath.substr(currPath.find_last_of("\\", slashPos - 1) + 1, 3) == "Sub") + { + slhaPath = "..\\Cards\\param_card.dat"; + } + else + { + slhaPath = "\\Cards\\param_card.dat"; + } + } + else + { + if (currPath.substr(currPath.find_last_of("/", slashPos - 1) + 1, 2) == "P1") + { + slhaPath = "../../Cards/param_card.dat"; + } + else if (currPath.substr(currPath.find_last_of("/", slashPos - 1) + 1, 3) == "Sub") + { + slhaPath = "../Cards/param_card.dat"; + } + else + { + slhaPath = "/Cards/param_card.dat"; + } + } + } + + static std::vector> rwgtRun = {%(make_rwgt)s}; + + auto rwgt_runner = REX::tea::param_rwgt(REX::load_lhef(lheFilePath), rwgtRun); + rwgt_runner.read_slha_rwgt(slhaPath, rwgtCardPath); + rwgt_runner.pool_threads = nb_threads; + + rwgt_runner.run(); + + std::cout << "\nReweighting procedure finished.\n"; + + std::ofstream lhe_out(outputPath); + if (!lhe_out) + throw std::runtime_error("Failed to open output LHE file for writing."); + rwgt_runner.print(lhe_out, true); + + std::cout << "Reweighted LHE 
file written to " << outputPath << ".\n"; + + writeRwgtCsv("rwgt_results.csv", *rwgt_runner.weight_ids, rwgt_runner.rwgt_xSec, rwgt_runner.rwgt_xErr); + + return 0; +} diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_instance.cc b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_instance.cc new file mode 100644 index 0000000000..8463cfbc30 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_instance.cc @@ -0,0 +1,176 @@ +//========================================================================== +// Copyright (C) 2023-2025 CERN +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Written by: Z. Wettersten (Jan 2024) for the MG5aMC CUDACPP plugin. +//========================================================================== +//========================================================================== +// Library including generic functions and classes for event reweighting. +// Process-specific rwgt_runner files are generated by mg5amc@nlo and use +// this library, while the rwgt_driver file is a wrapping program that +// calls the process-specific runners for given subprocesses. 
+//========================================================================== + +#ifndef _RWGT_INSTANCE_CC_ +#define _RWGT_INSTANCE_CC_ + +#include "rwgt_instance.h" + +namespace rwgt +{ + + // ZW: Function for calculating the number of remaining events in a warp + // in order to pad the input arrays to a multiple of the warp size + unsigned int warpRemain(unsigned int nEvt, unsigned int nWarp) + { + return (nWarp - (nEvt % nWarp)) % nWarp; + } + + // ZW: Function for padding the input arrays to a multiple of the warp size + template + void warpPad(std::vector &input, unsigned int nWarp = 32) + { + auto nEvt = input.size(); + auto nWarpRemain = warpRemain(nEvt, nWarp); + input.reserve(nEvt + nWarpRemain); + for (size_t k = nEvt - nWarpRemain; k < nEvt; ++k) + { + input.push_back(input[k]); + } + return; + } + + fBridge::fBridge(REX::process &proc, unsigned int warpSize) + { + this->nPar = proc.events[0]->view().size(); + this->nEvt = proc.events.size(); + this->nWarp = warpSize; + this->nWarpRemain = warpRemain(nEvt, nWarp); + this->fauxNEvt = nEvt + nWarpRemain; + this->gs = proc.gS(); + this->rndHel = std::vector(fauxNEvt, 0.); + this->rndCol = std::vector(fauxNEvt, 0.); + this->selHel = std::vector(fauxNEvt, 0.); + this->selCol = std::vector(fauxNEvt, 0.); + this->goodHel = false; + } + + fBridge::fBridge(std::shared_ptr proc, unsigned int warpSize) + { + if (proc == nullptr) + { + throw std::runtime_error("fBridge: Provided process is null or contains no subprocesses."); + } + // Assuming all subprocesses have the same number of particles + this->nPar = proc->events[0]->view().size(); + this->nEvt = proc->events.size(); + this->nWarp = warpSize; + this->nWarpRemain = warpRemain(nEvt, nWarp); + this->fauxNEvt = nEvt + nWarpRemain; + this->gs = proc->gS(); + this->rndHel = std::vector(fauxNEvt, 0.); + this->rndCol = std::vector(fauxNEvt, 0.); + this->selHel = std::vector(fauxNEvt, 0.); + this->selCol = std::vector(fauxNEvt, 0.); + this->goodHel = false; + } + + 
void fBridge::bridgeSetup(unsigned int &noEvts, unsigned int warpSize) + { + this->nEvt = noEvts; + this->nWarp = warpSize; + this->nWarpRemain = warpRemain(nEvt, nWarp); + this->fauxNEvt = nEvt + nWarpRemain; + this->rndHel = std::vector(fauxNEvt, 0.); + this->rndCol = std::vector(fauxNEvt, 0.); + this->selHel = std::vector(fauxNEvt, 0.); + this->selCol = std::vector(fauxNEvt, 0.); + } + void fBridge::bridgeSetup(std::vector &evVec, unsigned int warpSize) + { + this->nEvt = evVec.size(); + this->nWarp = warpSize; + this->nWarpRemain = warpRemain(nEvt, nWarp); + this->fauxNEvt = nEvt + nWarpRemain; + this->rndHel = std::vector(fauxNEvt, 0.); + this->rndCol = std::vector(fauxNEvt, 0.); + this->selHel = std::vector(fauxNEvt, 0.); + this->selCol = std::vector(fauxNEvt, 0.); + } + void fBridge::bridgeSetup(std::shared_ptr> &evVec, unsigned int warpSize) + { + this->bridgeSetup(*evVec, warpSize); + } + + void fBridge::bridgeSetup(REX::process &proc, unsigned int warpSize) + { + this->nPar = proc.events[0]->view().size(); + this->nEvt = proc.events.size(); + this->nWarp = warpSize; + this->nWarpRemain = warpRemain(nEvt, nWarp); + this->fauxNEvt = nEvt + nWarpRemain; + this->gs = proc.gS(); + this->rndHel = std::vector(fauxNEvt, 0.); + this->rndCol = std::vector(fauxNEvt, 0.); + this->selHel = std::vector(fauxNEvt, 0.); + this->selCol = std::vector(fauxNEvt, 0.); + this->goodHel = false; + } + + void fBridge::bridgeSetup(std::shared_ptr proc, unsigned int warpSize) + { + this->bridgeSetup(*proc, warpSize); + } + + void fBridge::setBridge(bridgeWrapper amp) + { + if (this->bridge == nullptr) + { + this->bridge = amp; + } + else + throw std::runtime_error("fBridge object doubly defined."); + } + std::shared_ptr> fBridge::bridgeCall(std::vector &momenta) + { + if (this->nEvt == 0) + this->bridgeSetup(this->gs); + if (this->bridge == nullptr) + throw std::runtime_error("fBridge object not defined."); + if (this->gs.size() != fauxNEvt) + warpPad(gs, nWarp); + warpPad(momenta, 
nWarp * nPar * nMom); + auto evalScatAmps = this->bridge( + fauxNEvt, nPar, nMom, + momenta, gs, rndHel, rndCol, + selHel, selCol, chanId, goodHel); + momenta.resize(nEvt * nPar * nMom); + evalScatAmps->resize(nEvt); + return evalScatAmps; + } + + std::shared_ptr> fBridge::bridgeCall(REX::process &proc) + { + if (this->nEvt == 0) + this->bridgeSetup(proc); + return this->bridgeCall(proc.momenta_.flat_vector()); + } + + std::shared_ptr> fBridge::bridgeCall(std::shared_ptr proc) + { + return this->bridgeCall(proc->momenta_.flat_vector()); + } + + REX::tea::weightor fBridge::getAmp() + { + if (this->bridge == nullptr) + throw std::runtime_error("fBridge object not defined."); + REX::tea::weightor amp = [this](REX::process &process) + { + return this->bridgeCall(process); + }; + return amp; + } + +} + +#endif diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_instance.h b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_instance.h new file mode 100644 index 0000000000..30e1bcd402 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_instance.h @@ -0,0 +1,74 @@ +//========================================================================== +// Copyright (C) 2023-2025 CERN +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Written by: Z. Wettersten (Jan 2024) for the MG5aMC CUDACPP plugin. +//========================================================================== +//========================================================================== +// Library including generic functions and classes for event reweighting. +// Process-specific rwgt_runner files are generated by mg5amc@nlo and use +// this library, while the rwgt_driver file is a wrapping program that +// calls the process-specific runners for given subprocesses. 
+//========================================================================== + +#ifndef _RWGT_INSTANCE_H_ +#define _RWGT_INSTANCE_H_ + +#include "teaRex.h" + +/** + * The floating point precision used in Fortran arrays. + * This is presently hardcoded to double precision (REAL*8). + */ +using FORTRANFPTYPE = double; // for Fortran double precision (REAL*8) arrays +// using FORTRANFPTYPE = float; // for Fortran single precision (REAL*4) arrays + +namespace rwgt +{ + + // ZW: Function for calculating the number of remaining events in a warp + // in order to pad the input arrays to a multiple of the warp size + unsigned int warpRemain(unsigned int nEvt, unsigned int nWarp = 32); + + // ZW: bridgeWrapper needs args: nEvs, nPar, nMom, moms, gs, rndhel, rndcol, selhel, selcol, chanId + using bridgeWrapper = std::function>(int &, int &, int &, std::vector &, std::vector &, std::vector &, std::vector &, std::vector &, std::vector &, unsigned int &, bool &)>; + + struct fBridge + { + // Default constructors + fBridge() = default; + fBridge(const fBridge &source) = default; + fBridge(fBridge &&source) = default; + + std::vector rndHel = {}; + std::vector rndCol = {}; + std::vector gs = {}; + std::vector selHel = {}; + std::vector selCol = {}; + unsigned int chanId = 0; + int nMom = 4; + int nWarp; + int nWarpRemain; + int nEvt; + int fauxNEvt; + int nPar; + bool goodHel = false; + bridgeWrapper bridge; + fBridge(REX::process &proc, unsigned int warpSize = 32); + fBridge(std::shared_ptr proc, unsigned int warpSize = 32); + void init(std::vector &process, unsigned int warpSize = 32); + void init(std::vector> process, unsigned int warpSize = 32); + void bridgeSetup(unsigned int &noEvts, unsigned int warpSize = 32); + void bridgeSetup(std::vector &evVec, unsigned int warpSize = 32); + void bridgeSetup(std::shared_ptr> &evVec, unsigned int warpSize = 32); + void bridgeSetup(REX::process &proc, unsigned int warpSize = 32); + void bridgeSetup(std::shared_ptr proc, unsigned int 
warpSize = 32); + void setBridge(bridgeWrapper amp); + std::shared_ptr> bridgeCall(std::vector &momenta); + std::shared_ptr> bridgeCall(REX::process &process); + std::shared_ptr> bridgeCall(std::shared_ptr process); + REX::tea::weightor getAmp(); + }; + +} + +#endif \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_runner_cc.inc b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_runner_cc.inc new file mode 100644 index 0000000000..86329abc0c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_runner_cc.inc @@ -0,0 +1,64 @@ +//========================================================================== +// Copyright (C) 2023-2025 CERN +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Written by: Z. Wettersten (Jan 2024) for the MG5aMC CUDACPP plugin. +//========================================================================== +//========================================================================== +// This file has been automatically generated for the CUDACPP plugin by +%(info_lines)s +//========================================================================== +//========================================================================== +// A class for reweighting matrix elements for +%(process_lines)s +//-------------------------------------------------------------------------- +#ifndef _TREX_ +#define _TREX_ +#endif +#include "rwgt_instance.h" +#include "fbridge.h" + +namespace %(process_namespace)s +{ + + std::shared_ptr get_comp() + { + static std::vector> status = {%(parton_status)s}; + static std::vector> pdg = {%(parton_ids)s}; + if (status.size() != pdg.size()) + throw std::runtime_error("Inconsistent event data in rwgt_runner"); + static std::vector> loc_evs; + for (size_t i = 0; i < status.size(); ++i) + { + auto ev = std::make_shared(pdg[i].size()); + ev->set_status(status[i]); + ev->set_pdg(pdg[i]); + loc_evs.push_back(ev); + } + return 
std::make_shared(loc_evs, REX::external_legs_comparator); + } + + std::shared_ptr> amp(int &nEvt, int &nPar, int &nMom, std::vector &momenta, std::vector &alphaS, std::vector &rndHel, std::vector &rndCol, std::vector &selHel, std::vector &selCol, unsigned int &chanId, bool &goodHel) + { + CppObjectInFortran *bridgeInst; + auto evalScatAmps = std::make_shared>(nEvt); + fbridgecreate_(&bridgeInst, &nEvt, &nPar, &nMom); + fbridgesequence_nomultichannel_(&bridgeInst, &momenta.at(0), &alphaS.at(0), &rndHel[0], &rndCol[0], &evalScatAmps->at(0), &selHel[0], &selCol[0], &goodHel); + fbridgedelete_(&bridgeInst); + return evalScatAmps; + } + + rwgt::fBridge bridgeConstr(size_t warpSize) + { + rwgt::fBridge bridge; + bridge.setBridge(amp); + bridge.nWarp = warpSize; + return bridge; + } + + std::shared_ptr make_reweightor(size_t warpSize) + { + static rwgt::fBridge bridge = bridgeConstr(warpSize); + auto comp = get_comp(); + return std::make_shared(bridge.getAmp(), comp); + } +} diff --git a/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_runner_h.inc b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_runner_h.inc new file mode 100644 index 0000000000..b990de7a00 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/MadtRex/template_files/rwgt_runner_h.inc @@ -0,0 +1,31 @@ +//========================================================================== +// Copyright (C) 2023-2024 CERN +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Written by: Z. Wettersten (June 2024) for the MG5aMC CUDACPP plugin. 
+//========================================================================== +//========================================================================== +// This file has been automatically generated for the CUDACPP plugin by +%(info_lines)s +//========================================================================== +//========================================================================== +// A class for reweighting matrix elements for +%(process_lines)s +//-------------------------------------------------------------------------- + +#ifndef _%(process_namespace)s_RUNNER_H_ +#define _%(process_namespace)s_RUNNER_H_ + +#include "rwgt_instance.h" + +namespace %(process_namespace)s { + + REX::eventBelongs get_comp(); + std::shared_ptr> amp( int& nEvt, int& nPar, int& nMom, std::vector& momenta, std::vector& alphaS, std::vector& rndHel, std::vector& rndCol, std::vector& selHel, std::vector& selCol, int& chanId, bool& goodHel ); + rwgt::fBridge bridgeConstr(size_t warpSize = 32); + std::shared_ptr make_reweightor(size_t warpSize = 32); + +} + + + +#endif diff --git a/PLUGIN/CUDACPP_OUTPUT/__init__.py b/PLUGIN/CUDACPP_OUTPUT/__init__.py new file mode 100644 index 0000000000..381a7805fb --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/__init__.py @@ -0,0 +1,81 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. +# Further modified by: D. Massaro, O. Mattelaer, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. + +# AV - Rename the plugin as CUDACPP_OUTPUT (even if the madgraph4gpu directory is still called CUDACPP_SA_OUTPUT) +# This can be used in mg5amcnlo in one of two ways: +# 1. production mode: a tarball containing CUDACPP_OUTPUT is untarred in PLUGIN/CUDACPP_OUTPUT +# 2. developers mode: the madgraph4gpu CUDACPP_SA_OUTPUT is symlinked in MG5aMC_PLUGIN/CUDACPP_OUTPUT (PYTHONPATH=.. 
./bin/mg5_aMC -m CUDACPP_OUTPUT) +PLUGIN_NAME = __name__ # PLUGIN_NAME can be one of PLUGIN/CUDACPP_OUTPUT or MG5aMC_PLUGIN/CUDACPP_OUTPUT +print('Loading plugin %s'%PLUGIN_NAME) + +# AV - Require Python >= 3.8 to ensure that {} dictionaries preserve the order of item insertion +# (note: python3.7 would probably be enough but this plugin has only been tested using python3.8) +import sys +minpython = (3,8) +if sys.version_info < minpython : + + print('ERROR! Cannot load plugin %s: Python >= %s.%s is required' % (PLUGIN_NAME, minpython[0], minpython[1] )) + +else: + + # Import the required files + # Example: import maddm_interface as maddm_interface # local file + # import madgraph.various.cluster as cluster # MG5 distribution file + + # Three types of functionality are allowed in a plugin + # 1. new output mode + # 2. new cluster support + # 3. new interface + + # 1. Define new output mode. + # Example: new_output = {'myformat': MYCLASS} + # allows the command "output myformat PATH" in madgraph. + # MYCLASS should inherit from class madgraph.iolibs.export_v4.VirtualExporter + ###import PLUGIN.CUDACPP_OUTPUT.output as output # AV modify this to also allow MG5aMC_PLUGIN + __import__('%s.output'%PLUGIN_NAME) + output = sys.modules['%s.output'%PLUGIN_NAME] + __import__('%s.trex'%PLUGIN_NAME) + trex = sys.modules['%s.trex'%PLUGIN_NAME] + new_output = { 'madevent_simd' : output.SIMD_ProcessExporter, + 'madevent_gpu' : output.GPU_ProcessExporter, + 'standalone_cudacpp' : output.PLUGIN_ProcessExporter, + 'standalone_trex' : trex.TREX_ProcessExporter, + # the following one are used for the second exporter class + # (not really needed so far but interesting if need + # specialization in the futur) + 'standalone_simd' : output.SIMD_ProcessExporter, + 'standalone_cuda' : output.GPU_ProcessExporter, + } + new_reweight = {'madtrex': trex.TREX_ReweightInterface} + + # 2. Define new way to handle the cluster. 
+ # Example: new_cluster = {'mycluster': MYCLUSTERCLASS} + # allows the command "set cluster_type mycluster" in madgraph + # MYCLUSTERCLASS should inherit from class madgraph.various.cluster.Cluster. + new_cluster = {} + + # 3. Define a new interface (allows adding/modifying MG5 command). + # This can be activated via ./bin/mg5_aMC --mode=PLUGINNAME. + # Put None if no dedicated command are required + if PLUGIN_NAME.rsplit('.',1)[0] == 'MG5aMC_PLUGIN': + import madgraph.interface.master_interface as interface + new_interface = interface.MasterCmd # use the default interface (but this is needed in the '-m' aka '--mode' option) + else: + new_interface = None + + ########################## CONTROL VARIABLE #################################### + + __author__ = 'Andrea Valassi' + __email__ = 'andrea.valassi@cern.ch' + + # Plugin version (major,minor,patch) where major>1, 0<=minor<=99 and 0<=patch<=99 + # The release infrastructure expects 'vN.NN.NN' tags with 1-digit major and 2-digit minor and patch versions + # and it takes care of converting the python tuple '(1,0,1)' into a version string 'v1.00.01' + # NB! 
Do not use '(1,00,01)' here: leading zeros in decimal integer literals are not permitted in python (#1013) + __version__ = (1,1,1) + + minimal_mg5amcnlo_version = (3,6,4) + maximal_mg5amcnlo_version = (1000,1000,1000) + latest_validated_version = (3,6,5) diff --git a/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/create_acceptance_from_file.py b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/create_acceptance_from_file.py new file mode 100644 index 0000000000..f012a3ca63 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/create_acceptance_from_file.py @@ -0,0 +1,132 @@ +#### +## Automatic creation of CI/CD for the plugin repo +## and automatic test in madgraph format style +## test can be added easily by adding a file in the directory +## simple_cross_check +## format of the file is the cmd file to pass to the code +## two type of additional metadata line are present +## #title provide the documentation information for the unittest (docstring) +## #check RUN_NAME CROSS ERROR NB_EVENT provide the target cross-section/error/requested event + + +import os +import glob + +curr_dir = os.path.dirname(__file__) +pjoin = os.path.join +template_runtest = """ +# WARNING THIS FILE IS AUTOGENERATED -> edit test_simd_madevent.template + def test_%(name)s(self): + \"\"\"%(doc)s\"\"\" + + if logging.getLogger('madgraph').level <= 20: + stdout=None + stderr=None + else: + devnull =open(os.devnull,'w') + stdout=devnull + stderr=devnull + + try: + shutil.rmtree('/tmp/MGPROCESS/') + except Exception as error: + pass + + cmd = \"\"\"%(cmd)s + \"\"\" %%self.run_dir + + open(pjoin(self.path, 'mg5_cmd'),'w').write(cmd) + newenv = os.environ.copy() + newenv[\"PYTHONPATH\"] = pjoin(MG5DIR, '..') + subprocess.call([sys.executable, pjoin(MG5DIR, 'bin','mg5_aMC'),'-m','CUDACPP_OUTPUT', + pjoin(self.path, 'mg5_cmd')], env=newenv, + #cwd=self.path, + stdout=stdout, stderr=stderr) +""" + +template_onecheck = """ + self.check_parton_output(cross=%(cross)s, error=%(err)s, run_name='%(run_name)s', 
html=%(html)s) + event = '%%s/Events/%(run_name)s/unweighted_events.lhe' %% self.run_dir + if not os.path.exists(event): + misc.gunzip(event) + + lhefile = lhe_parser.EventFile(event) + nb_event = 0 + for event in lhe_parser.EventFile(event): + event.check() + nb_event+=1 + + self.assertEqual(nb_event, %(nb_event)s)""" + + +def create_test_simd_madevent(): + + text = "# WARNING THIS FILE IS AUTOGENERATED -> edit test_simd_madevent.template\n\n\n" + text += open(pjoin(curr_dir, 'test_simd_madevent.template'),'r').read() + + for filename in sorted(os.listdir(pjoin(curr_dir, 'simple_cross_check'))): + + opt = {} + opt['name'] = filename + opt['cmd'] = open(pjoin(curr_dir, 'simple_cross_check',filename)).read() + opt['doc'] = "\n".join([line[5:] for line in opt['cmd'].split('\n') if line.startswith('#title')]) + + text += template_runtest % opt + + checks = [line.split() for line in opt['cmd'].split('\n') if line.startswith('#check')] + + for i, check in enumerate(checks): + _, name, cross, err, nb_event = check + opt['run_name'] = name + opt['cross'] = cross + opt['err'] = err + opt['nb_event'] = nb_event + opt['html'] = 'True' if i==0 else 'False' + + text += template_onecheck % opt + + open(pjoin(curr_dir, 'test_simd_madevent.py'), 'w').write(text) + +template_one_cicd=""" + %(name)s: + # The type of runner that the job will run on + runs-on: ubuntu-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + with: + submodules: 'true' + + # Runs a set of commands using the runners shell + - name: test one of the test testIO_AcceptanceProcOutputIOTests + run: | + cd $GITHUB_WORKSPACE + cd MG5aMC/mg5amcnlo/ + cp input/.mg5_configuration_default.txt input/mg5_configuration.txt + cp Template/LO/Source/.make_opts Template/LO/Source/make_opts + if [ -f tests/cudacpp_acceptance_tests ]; then echo 'ERROR! 
tests/cudacpp_acceptance_tests already exists'; exit 1; fi # should never happen + ln -sf ../../MG5aMC_PLUGIN/CUDACPP_OUTPUT/acceptance_tests tests/cudacpp_acceptance_tests # workaround for 'relative position not supported' + ./tests/test_manager.py -p./tests/cudacpp_acceptance_tests/ test_%(name)s + rm -f tests/cudacpp_acceptance_tests +""" + +def create_cicd(): + + text = "# WARNING THIS FILE IS AUTOGENERATED -> edit test_simd_madevent.template\n\n\n" + text += open(pjoin(curr_dir, 'madgraph_launch.template'),'r').read() + + for filename in sorted(os.listdir(pjoin(curr_dir, 'simple_cross_check'))): + text += template_one_cicd % {'name': filename} + + GITDIR =pjoin(os.path.realpath(curr_dir), os.path.pardir,os.path.pardir,os.path.pardir,os.path.pardir,os.path.pardir,os.path.pardir) + GITDIR = os.path.realpath(GITDIR) + open(pjoin(GITDIR, '.github', 'workflows', 'madgraph_launch_test.yml'), 'w').write(text) + + +if __name__ == '__main__': + create_test_simd_madevent() + create_cicd() + + diff --git a/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/madgraph_launch.template b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/madgraph_launch.template new file mode 100644 index 0000000000..7a8a08aba9 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/madgraph_launch.template @@ -0,0 +1,29 @@ +# This is a basic workflow to help you get started with Actions + +name: running acceptance test +# Controls when the workflow will run +# branches: [ main LTS ] +on: + # Triggers the workflow on push or pull request events but only for the 3.4.0 branch + push: + paths-ignore: + - 'docs/**' +# - '.github/**' + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + inputs: + message: + description: 'running acceptance test' + required: true + +env: + commitmsg: ${{ github.event.head_commit.message }} + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +# A workflow run is made up of one or more jobs that can run 
sequentially or in parallel +jobs: + # This workflow contains a single job called "build" diff --git a/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_eemumua_float b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_eemumua_float new file mode 100644 index 0000000000..5b63dd1bff --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_eemumua_float @@ -0,0 +1,10 @@ +#title check eemumua in single precision +import model sm + set automatic_html_opening False --no_save + set notification_center False --no_save + generate e+ e- > mu+ mu- a + output madevent_simd %s -f -nojpeg + launch + set nevents 100 + set floating_type f +#check run_01 0.0266 0.0002854 100 diff --git a/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_heft_ggh_double b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_heft_ggh_double new file mode 100644 index 0000000000..21ea1875b0 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_heft_ggh_double @@ -0,0 +1,10 @@ +#title testing a HEFT process gg>aa in double precision +import model heft + set automatic_html_opening False --no_save + set notification_center False --no_save + generate g g > h > a a + output madevent_simd %s -f -nojpeg + launch + set nevents 100 + set floating_type d +#check run_01 0.01859 0.0002853789088650386 100 diff --git a/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_pptt_mixed b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_pptt_mixed new file mode 100644 index 0000000000..8cd87df8f1 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_pptt_mixed @@ -0,0 +1,10 @@ +#title check ggtt within mixed mode +import model sm + set automatic_html_opening False --no_save + set notification_center False --no_save + generate p p > t t~ + output madevent_simd %s -f -nojpeg + launch + set nevents 100 + set floating_type m +#check 
run_01 505.5 2.749 100 diff --git a/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_vector_size b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_vector_size new file mode 100644 index 0000000000..10396147fd --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/simple_cross_check/simd_cpp_vector_size @@ -0,0 +1,17 @@ +#title: check that multiple vector size returns the same value +import model sm +set automatic_html_opening False --no_save +set notification_center False --no_save +generate p p > t t~ +output madevent_simd %s -f -nojpeg +launch +set nevents 100 +set floating_type m +set vector_size 16 +launch +set vector_size 32 +launch +set vector_size 64 +#check run_01 505.5 2.749 100 +#check run_02 505.5 2.749 100 +#check run_03 505.5 2.749 100 diff --git a/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/test_simd_madevent.py b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/test_simd_madevent.py new file mode 100644 index 0000000000..f93ec4ba5b --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/test_simd_madevent.py @@ -0,0 +1,350 @@ +# WARNING THIS FILE IS AUTOGENERATED -> edit test_simd_madevent.template + + +################################################################################ +# +# Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors +# +# This file is a part of the MadGraph5_aMC@NLO project, an application which +# automatically generates Feynman diagrams and matrix elements for arbitrary +# high-energy processes in the Standard Model and beyond. +# +# It is subject to the MadGraph5_aMC@NLO license which should accompany this +# distribution. 
+# +# For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch +# +################################################################################ +from __future__ import division +from __future__ import absolute_import +import subprocess +import unittest +import os +import re +import shutil +import sys +import logging +import time +import tempfile +import math +import madgraph + + +logger = logging.getLogger('test_cmd') + +import tests.unit_tests.iolibs.test_file_writers as test_file_writers + +import madgraph.interface.master_interface as MGCmd +import madgraph.interface.madevent_interface as MECmd +import madgraph.interface.launch_ext_program as launch_ext +import madgraph.iolibs.files as files + +import madgraph.various.misc as misc +import madgraph.various.lhe_parser as lhe_parser +import madgraph.various.banner as banner_mod +import madgraph.various.lhe_parser as lhe_parser +import madgraph.various.banner as banner + +_file_path = os.path.split(os.path.dirname(os.path.realpath(__file__)))[0] +_pickle_path =os.path.join(_file_path, 'input_files') + +from madgraph import MG4DIR, MG5DIR, MadGraph5Error, InvalidCmd + +from tests.acceptance_tests.test_cmd_madevent import check_html_page +pjoin = os.path.join + + +#=============================================================================== +# TestCmd +#=============================================================================== +class TestCPPfromfile(unittest.TestCase): # inherit from upstream test_cmd_madevent + """test that we can launch everything from a single file""" + + + def setUp(self): + + self.debuging = unittest.debug + if self.debuging: + self.path = pjoin(MG5DIR, 'ACC_TEST') + if os.path.exists(self.path): + shutil.rmtree(self.path) + os.mkdir(self.path) + else: + self.path = tempfile.mkdtemp(prefix='acc_test_mg5') + self.run_dir = pjoin(self.path, 'MGPROC') + + + def tearDown(self): + + if not self.debuging: + shutil.rmtree(self.path) + self.assertFalse(self.debuging) + + 
def load_result(self, run_name): + + import madgraph.iolibs.save_load_object as save_load_object + import madgraph.madevent.gen_crossxhtml as gen_crossxhtml + + result = save_load_object.load_from_file(pjoin(self.run_dir,'HTML/results.pkl')) + return result[run_name] + + def check_parton_output(self, run_name='run_01', target_event=100, cross=0, error=9e99, delta_event=0, html=True): + """Check that parton output exists and reach the targert for event""" + + # check that the number of event is fine: + data = self.load_result(run_name) + if target_event > 0: + if delta_event == 0: + self.assertEqual(target_event, int(data[0]['nb_event'])) + else: + self.assertLessEqual(abs(int(data[0]['nb_event'])-target_event), delta_event) + self.assertIn('lhe', data[0].parton) + + if cross: + import math + new_error = math.sqrt(error**2 + float(data[0]['error'])**2) + self.assertLess( + abs(cross - float(data[0]['cross']))/new_error, + 3, + 'cross is %s and not %s. NB_SIGMA %s' % (float(data[0]['cross']), cross, float(data[0]['cross'])/new_error) + ) + self.assertLess(float(data[0]['error']), 3 * error) + if html: + check_html_page(self, pjoin(self.run_dir, 'crossx.html')) + if 'decayed' not in run_name: + check_html_page(self, pjoin(self.run_dir,'HTML', run_name, 'results.html')) + + + + +# WARNING THIS FILE IS AUTOGENERATED -> edit test_simd_madevent.template + def test_simd_cpp_eemumua_float(self): + """e check eemumua in single precision""" + + if logging.getLogger('madgraph').level <= 20: + stdout=None + stderr=None + else: + devnull =open(os.devnull,'w') + stdout=devnull + stderr=devnull + + try: + shutil.rmtree('/tmp/MGPROCESS/') + except Exception as error: + pass + + cmd = """#title check eemumua in single precision +import model sm + set automatic_html_opening False --no_save + set notification_center False --no_save + generate e+ e- > mu+ mu- a + output madevent_simd %s -f -nojpeg + launch + set nevents 100 + set floating_type f +#check run_01 0.0266 0.0002854 100 + + 
""" %self.run_dir + + open(pjoin(self.path, 'mg5_cmd'),'w').write(cmd) + newenv = os.environ.copy() + newenv["PYTHONPATH"] = pjoin(MG5DIR, '..') + subprocess.call([sys.executable, pjoin(MG5DIR, 'bin','mg5_aMC'),'-m','CUDACPP_OUTPUT', + pjoin(self.path, 'mg5_cmd')], env=newenv, + #cwd=self.path, + stdout=stdout, stderr=stderr) + + self.check_parton_output(cross=0.0266, error=0.0002854, run_name='run_01', html=True) + event = '%s/Events/run_01/unweighted_events.lhe' % self.run_dir + if not os.path.exists(event): + misc.gunzip(event) + + lhefile = lhe_parser.EventFile(event) + nb_event = 0 + for event in lhe_parser.EventFile(event): + event.check() + nb_event+=1 + + self.assertEqual(nb_event, 100) +# WARNING THIS FILE IS AUTOGENERATED -> edit test_simd_madevent.template + def test_simd_cpp_heft_ggh_double(self): + """e testing a HEFT process gg>aa in double precision""" + + if logging.getLogger('madgraph').level <= 20: + stdout=None + stderr=None + else: + devnull =open(os.devnull,'w') + stdout=devnull + stderr=devnull + + try: + shutil.rmtree('/tmp/MGPROCESS/') + except Exception as error: + pass + + cmd = """#title testing a HEFT process gg>aa in double precision +import model heft + set automatic_html_opening False --no_save + set notification_center False --no_save + generate g g > h > a a + output madevent_simd %s -f -nojpeg + launch + set nevents 100 + set floating_type d +#check run_01 0.01859 0.0002853789088650386 100 + + """ %self.run_dir + + open(pjoin(self.path, 'mg5_cmd'),'w').write(cmd) + newenv = os.environ.copy() + newenv["PYTHONPATH"] = pjoin(MG5DIR, '..') + subprocess.call([sys.executable, pjoin(MG5DIR, 'bin','mg5_aMC'),'-m','CUDACPP_OUTPUT', + pjoin(self.path, 'mg5_cmd')], env=newenv, + #cwd=self.path, + stdout=stdout, stderr=stderr) + + self.check_parton_output(cross=0.01859, error=0.0002853789088650386, run_name='run_01', html=True) + event = '%s/Events/run_01/unweighted_events.lhe' % self.run_dir + if not os.path.exists(event): + 
misc.gunzip(event) + + lhefile = lhe_parser.EventFile(event) + nb_event = 0 + for event in lhe_parser.EventFile(event): + event.check() + nb_event+=1 + + self.assertEqual(nb_event, 100) +# WARNING THIS FILE IS AUTOGENERATED -> edit test_simd_madevent.template + def test_simd_cpp_pptt_mixed(self): + """e check ggtt within mixed mode""" + + if logging.getLogger('madgraph').level <= 20: + stdout=None + stderr=None + else: + devnull =open(os.devnull,'w') + stdout=devnull + stderr=devnull + + try: + shutil.rmtree('/tmp/MGPROCESS/') + except Exception as error: + pass + + cmd = """#title check ggtt within mixed mode +import model sm + set automatic_html_opening False --no_save + set notification_center False --no_save + generate p p > t t~ + output madevent_simd %s -f -nojpeg + launch + set nevents 100 + set floating_type m +#check run_01 505.5 2.749 100 + + """ %self.run_dir + + open(pjoin(self.path, 'mg5_cmd'),'w').write(cmd) + newenv = os.environ.copy() + newenv["PYTHONPATH"] = pjoin(MG5DIR, '..') + subprocess.call([sys.executable, pjoin(MG5DIR, 'bin','mg5_aMC'),'-m','CUDACPP_OUTPUT', + pjoin(self.path, 'mg5_cmd')], env=newenv, + #cwd=self.path, + stdout=stdout, stderr=stderr) + + self.check_parton_output(cross=505.5, error=2.749, run_name='run_01', html=True) + event = '%s/Events/run_01/unweighted_events.lhe' % self.run_dir + if not os.path.exists(event): + misc.gunzip(event) + + lhefile = lhe_parser.EventFile(event) + nb_event = 0 + for event in lhe_parser.EventFile(event): + event.check() + nb_event+=1 + + self.assertEqual(nb_event, 100) +# WARNING THIS FILE IS AUTOGENERATED -> edit test_simd_madevent.template + def test_simd_cpp_vector_size(self): + """e: check that multiple vector size returns the same value""" + + if logging.getLogger('madgraph').level <= 20: + stdout=None + stderr=None + else: + devnull =open(os.devnull,'w') + stdout=devnull + stderr=devnull + + try: + shutil.rmtree('/tmp/MGPROCESS/') + except Exception as error: + pass + + cmd = """#title: 
check that multiple vector size returns the same value +import model sm +set automatic_html_opening False --no_save +set notification_center False --no_save +generate p p > t t~ +output madevent_simd %s -f -nojpeg +launch +set nevents 100 +set floating_type m +set vector_size 16 +launch +set vector_size 32 +launch +set vector_size 64 +#check run_01 505.5 2.749 100 +#check run_02 505.5 2.749 100 +#check run_03 505.5 2.749 100 + + """ %self.run_dir + + open(pjoin(self.path, 'mg5_cmd'),'w').write(cmd) + newenv = os.environ.copy() + newenv["PYTHONPATH"] = pjoin(MG5DIR, '..') + subprocess.call([sys.executable, pjoin(MG5DIR, 'bin','mg5_aMC'),'-m','CUDACPP_OUTPUT', + pjoin(self.path, 'mg5_cmd')], env=newenv, + #cwd=self.path, + stdout=stdout, stderr=stderr) + + self.check_parton_output(cross=505.5, error=2.749, run_name='run_01', html=True) + event = '%s/Events/run_01/unweighted_events.lhe' % self.run_dir + if not os.path.exists(event): + misc.gunzip(event) + + lhefile = lhe_parser.EventFile(event) + nb_event = 0 + for event in lhe_parser.EventFile(event): + event.check() + nb_event+=1 + + self.assertEqual(nb_event, 100) + self.check_parton_output(cross=505.5, error=2.749, run_name='run_02', html=False) + event = '%s/Events/run_02/unweighted_events.lhe' % self.run_dir + if not os.path.exists(event): + misc.gunzip(event) + + lhefile = lhe_parser.EventFile(event) + nb_event = 0 + for event in lhe_parser.EventFile(event): + event.check() + nb_event+=1 + + self.assertEqual(nb_event, 100) + self.check_parton_output(cross=505.5, error=2.749, run_name='run_03', html=False) + event = '%s/Events/run_03/unweighted_events.lhe' % self.run_dir + if not os.path.exists(event): + misc.gunzip(event) + + lhefile = lhe_parser.EventFile(event) + nb_event = 0 + for event in lhe_parser.EventFile(event): + event.check() + nb_event+=1 + + self.assertEqual(nb_event, 100) \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/test_simd_madevent.template 
b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/test_simd_madevent.template new file mode 100755 index 0000000000..3b9b8e2c7d --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/acceptance_tests/test_simd_madevent.template @@ -0,0 +1,115 @@ +################################################################################ +# +# Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors +# +# This file is a part of the MadGraph5_aMC@NLO project, an application which +# automatically generates Feynman diagrams and matrix elements for arbitrary +# high-energy processes in the Standard Model and beyond. +# +# It is subject to the MadGraph5_aMC@NLO license which should accompany this +# distribution. +# +# For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch +# +################################################################################ +from __future__ import division +from __future__ import absolute_import +import subprocess +import unittest +import os +import re +import shutil +import sys +import logging +import time +import tempfile +import math +import madgraph + + +logger = logging.getLogger('test_cmd') + +import tests.unit_tests.iolibs.test_file_writers as test_file_writers + +import madgraph.interface.master_interface as MGCmd +import madgraph.interface.madevent_interface as MECmd +import madgraph.interface.launch_ext_program as launch_ext +import madgraph.iolibs.files as files + +import madgraph.various.misc as misc +import madgraph.various.lhe_parser as lhe_parser +import madgraph.various.banner as banner_mod +import madgraph.various.lhe_parser as lhe_parser +import madgraph.various.banner as banner + +_file_path = os.path.split(os.path.dirname(os.path.realpath(__file__)))[0] +_pickle_path =os.path.join(_file_path, 'input_files') + +from madgraph import MG4DIR, MG5DIR, MadGraph5Error, InvalidCmd + +from tests.acceptance_tests.test_cmd_madevent import check_html_page +pjoin = os.path.join + + 
+#=============================================================================== +# TestCmd +#=============================================================================== +class TestCPPfromfile(unittest.TestCase): # inherit from upstream test_cmd_madevent + """test that we can launch everything from a single file""" + + + def setUp(self): + + self.debuging = unittest.debug + if self.debuging: + self.path = pjoin(MG5DIR, 'ACC_TEST') + if os.path.exists(self.path): + shutil.rmtree(self.path) + os.mkdir(self.path) + else: + self.path = tempfile.mkdtemp(prefix='acc_test_mg5') + self.run_dir = pjoin(self.path, 'MGPROC') + + + def tearDown(self): + + if not self.debuging: + shutil.rmtree(self.path) + self.assertFalse(self.debuging) + + def load_result(self, run_name): + + import madgraph.iolibs.save_load_object as save_load_object + import madgraph.madevent.gen_crossxhtml as gen_crossxhtml + + result = save_load_object.load_from_file(pjoin(self.run_dir,'HTML/results.pkl')) + return result[run_name] + + def check_parton_output(self, run_name='run_01', target_event=100, cross=0, error=9e99, delta_event=0, html=True): + """Check that parton output exists and reach the targert for event""" + + # check that the number of event is fine: + data = self.load_result(run_name) + if target_event > 0: + if delta_event == 0: + self.assertEqual(target_event, int(data[0]['nb_event'])) + else: + self.assertLessEqual(abs(int(data[0]['nb_event'])-target_event), delta_event) + self.assertIn('lhe', data[0].parton) + + if cross: + import math + new_error = math.sqrt(error**2 + float(data[0]['error'])**2) + self.assertLess( + abs(cross - float(data[0]['cross']))/new_error, + 3, + 'cross is %s and not %s. 
NB_SIGMA %s' % (float(data[0]['cross']), cross, float(data[0]['cross'])/new_error) + ) + self.assertLess(float(data[0]['error']), 3 * error) + if html: + check_html_page(self, pjoin(self.run_dir, 'crossx.html')) + if 'decayed' not in run_name: + check_html_page(self, pjoin(self.run_dir,'HTML', run_name, 'results.html')) + + + diff --git a/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.cu b/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.cu new file mode 100644 index 0000000000..3679e681e1 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.cu @@ -0,0 +1,11 @@ +! Copyright (C) 2010 The ALOHA Development team and Contributors. +! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. +!========================================================================== +! Copyright (C) 2020-2024 CERN and UCLouvain. +! Licensed under the GNU Lesser General Public License (version 3 or later). +! Modified by: O. Mattelaer (Mar 2020) for the MG5aMC CUDACPP plugin. +! Further modified by: O. Mattelaer, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +!========================================================================== + + //========================================================================== diff --git a/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h b/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h new file mode 100644 index 0000000000..73621b293a --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h @@ -0,0 +1,910 @@ +! Copyright (C) 2010 The ALOHA Development team and Contributors. +! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. +!========================================================================== +! Copyright (C) 2020-2024 CERN and UCLouvain. +! 
Licensed under the GNU Lesser General Public License (version 3 or later). +! Modified by: O. Mattelaer (Mar 2020) for the MG5aMC CUDACPP plugin. +! Further modified by: D. Massaro, O. Mattelaer, A. Thete, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +!========================================================================== + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_INLINE_HELAMPS +#define INLINE inline +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) +#else +#define INLINE +#define ALWAYS_INLINE +#endif + + // ALOHA-style object for easy flavor consolidation and non-template API + struct ALOHAOBJ { + + static constexpr int np4 = 4; + fptype_sv * pvec; + fptype * w; + int flv_index; + + __host__ __device__ ALOHAOBJ() = default; + __host__ __device__ ALOHAOBJ(fptype_sv * pvec_sv, cxtype_sv * w_sv, int flv = -1) + : pvec(pvec_sv), w(reinterpret_cast(w_sv)), flv_index(flv) {} + }; + + struct FLV_COUPLING_VIEW { + + const int* partner1; + const int* partner2; + const fptype* value; + + __host__ __device__ FLV_COUPLING_VIEW() = default; + __host__ __device__ + FLV_COUPLING_VIEW(const int* partner1_base, + const int* partner2_base, + const fptype* value_base, + const int n) + : partner1(partner1_base + n), + partner2(partner2_base + n), + value(value_base + 2*n) {} + }; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + ixxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + const int flv, // input: flavour + ALOHAOBJ & fi, // output: aloha objects + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + 
//-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ INLINE void + ipzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + int flv, // input: flavor index + ALOHAOBJ & fi, // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ INLINE void + imzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle), + int flv, // input: flavor index + ALOHAOBJ & fi, // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PT > 0) + template + __host__ __device__ INLINE void + ixzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + int flv, // input: flavor index + ALOHAOBJ & fi, // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + 
//-------------------------------------------------------------------------- + + // Compute the output wavefunction vc[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + vxxxxx( const fptype momenta[], // input: momenta + const fptype vmass, // input: vector boson mass + const int nhel, // input: -1, 0 (only if vmass!=0) or +1 (helicity of vector boson) + const int nsv, // input: +1 (final) or -1 (initial) + int flv, // input: flavor index + ALOHAOBJ & vc, // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction sc[3] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + sxxxxx( const fptype momenta[], // input: momenta + //const fptype, // WARNING: input "smass" unused (missing in Fortran) - scalar boson mass + //const int, // WARNING: input "nhel" unused (missing in Fortran) - scalar has no helicity! 
+ const int nss, // input: +1 (final) or -1 (initial) + ALOHAOBJ & sc, // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + oxxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1, 0 (only if vmass!=0) or +1 (helicity of vector boson) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + int flv, // input: flavor index + ALOHAOBJ & fo, // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ INLINE void + opzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + int flv, // input: flavor index + ALOHAOBJ & fo, // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ INLINE void + omzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + int flv, // input: flavor index + ALOHAOBJ & fo, // output: wavefunctions + const 
int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + oxzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + int flv, // input: flavor index + ALOHAOBJ & fo, // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //========================================================================== + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + ixxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + const int flv, // input: flavour + ALOHAOBJ & fi, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) + // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) + // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! 
(#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( fi.w ); + fi.pvec[0] = -pvec0 * (fptype)nsf; + fi.pvec[1] = -pvec1 * (fptype)nsf; + fi.pvec[2] = -pvec2 * (fptype)nsf; + fi.pvec[3] = -pvec3 * (fptype)nsf; + fi.flv_index = flv; + const int nh = nhel * nsf; + if( fmass != 0. ) + { +#ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif + // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: + // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 + // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used + const int ip = ( 1 + nh ) / 2; // NB: same as in Fortran pp==0, differs from Fortran pp>0, which is (3+nh)/2 because omega(2) has indexes 1,2 + const int im = ( 1 - nh ) / 2; // NB: same as in Fortran pp==0, differs from Fortran pp>0, which is (3-nh)/2 because omega(2) has indexes 1,2 +#ifndef MGONGPU_CPPSIMD + if( pp == 0. ) + { + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0. }; // possibility of negative fermion masses + //sqm[1] = ( fmass < 0. ? -abs( sqm[0] ) : abs( sqm[0] ) ); // AV: why abs here? + sqm[1] = ( fmass < 0. ? 
-sqm[0] : sqm[0] ); // AV: removed an abs here + w[0] = cxmake( ip * sqm[ip], 0 ); + w[1] = cxmake( im * nsf * sqm[ip], 0 ); + w[2] = cxmake( ip * nsf * sqm[im], 0 ); + w[3] = cxmake( im * sqm[im], 0 ); + } + else + { + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype omega[2] = { fpsqrt( pvec0 + pp ), 0. }; + omega[1] = fmass / omega[0]; + const fptype sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; + const fptype pp3 = fpmax( pp + pvec3, 0. ); + const cxtype chi[2] = { cxmake( fpsqrt( pp3 * (fptype)0.5 / pp ), 0. ), + ( pp3 == 0. ? cxmake( -nh, 0. ) : cxmake( nh * pvec1, pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) }; + w[0] = sfomega[0] * chi[im]; + w[1] = sfomega[0] * chi[ip]; + w[2] = sfomega[1] * chi[im]; + w[3] = sfomega[1] * chi[ip]; + } +#else + // Branch A: pp == 0. + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses (NB: SCALAR!) + sqm[1] = ( fmass < 0 ? -sqm[0] : sqm[0] ); // AV: removed an abs here (as above) + const cxtype fiA_2 = ip * sqm[ip]; // scalar cxtype: real part initialised from fptype, imag part = 0 + const cxtype fiA_3 = im * nsf * sqm[ip]; // scalar cxtype: real part initialised from fptype, imag part = 0 + const cxtype fiA_4 = ip * nsf * sqm[im]; // scalar cxtype: real part initialised from fptype, imag part = 0 + const cxtype fiA_5 = im * sqm[im]; // scalar cxtype: real part initialised from fptype, imag part = 0 + // Branch B: pp != 0. 
+ const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype_v omega[2] = { fpsqrt( pvec0 + pp ), 0 }; + omega[1] = fmass / omega[0]; + const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; + const fptype_v pp3 = fpmax( pp + pvec3, 0 ); + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + cxternary( ( pp3 == 0. ), + cxmake( -nh, 0 ), + cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 + const cxtype_v fiB_2 = sfomega[0] * chi[im]; + const cxtype_v fiB_3 = sfomega[0] * chi[ip]; + const cxtype_v fiB_4 = sfomega[1] * chi[im]; + const cxtype_v fiB_5 = sfomega[1] * chi[ip]; + // Choose between the results from branch A and branch B + const bool_v mask = ( pp == 0. ); + w[0] = cxternary( mask, fiA_2, fiB_2 ); + w[1] = cxternary( mask, fiA_3, fiB_3 ); + w[2] = cxternary( mask, fiA_4, fiB_4 ); + w[3] = cxternary( mask, fiA_5, fiB_5 ); +#endif + } + else + { +#ifdef MGONGPU_CPPSIMD + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), + cxternary( sqp0p3 == 0, + cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. 
), + cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 +#else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); + const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; +#endif + if( nh == 1 ) + { + w[0] = cxzero_sv(); + w[1] = cxzero_sv(); + w[2] = chi[0]; + w[3] = chi[1]; + } + else + { + w[0] = chi[1]; + w[1] = chi[0]; + w[2] = cxzero_sv(); + w[3] = cxzero_sv(); + } + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ void + ipzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + const int flv, // input: flavour + ALOHAOBJ & fi, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( fi.w ); + fi.pvec[0] = -pvec3 * (fptype)nsf; + fi.pvec[1] = fptype_sv{ 0 }; + fi.pvec[2] = fptype_sv{ 0 }; + fi.pvec[3] = -pvec3 * (fptype)nsf; + fi.flv_index = flv; + const int nh = nhel * nsf; + const cxtype_sv sqp0p3 = cxmake( fpsqrt( 2. * pvec3 ) * (fptype)nsf, 0. 
); + w[0] = cxmake( fi.pvec[1], fi.pvec[2] ); + if( nh == 1 ) + { + w[1] = cxmake( fi.pvec[1], fi.pvec[2] ); + w[2] = sqp0p3; + } + else + { + w[1] = sqp0p3; + w[2] = cxmake( fi.pvec[1], fi.pvec[2] ); + } + w[3] = cxmake( fi.pvec[1], fi.pvec[2] ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ void + imzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + const int flv, // input: flavour + ALOHAOBJ & fi, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( fi.w ); + fi.pvec[0] = pvec3 * (fptype)nsf; + fi.pvec[1] = fptype_sv{ 0 }; + fi.pvec[2] = fptype_sv{ 0 }; + fi.pvec[3] = -pvec3 * (fptype)nsf; + fi.flv_index = flv; + const int nh = nhel * nsf; + const cxtype_sv chi = cxmake( -(fptype)nhel * fpsqrt( -2. * pvec3 ), 0. 
); + w[1] = cxzero_sv(); + w[2] = cxzero_sv(); + if( nh == 1 ) + { + w[0] = cxzero_sv(); + w[3] = chi; + } + else + { + w[0] = chi; + w[3] = cxzero_sv(); + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PT > 0) + template + __host__ __device__ void + ixzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + const int flv, // input: flavour + ALOHAOBJ & fi, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( fi.w ); + fi.pvec[0] = -pvec0 * (fptype)nsf; + fi.pvec[1] = -pvec1 * (fptype)nsf; + fi.pvec[2] = -pvec2 * (fptype)nsf; + fi.pvec[3] = -pvec3 * (fptype)nsf; + fi.flv_index = flv; + const int nh = nhel * nsf; + //const float sqp0p3 = sqrtf( pvec0 + pvec3 ) * nsf; // AV: why force a float here? + const fptype_sv sqp0p3 = fpsqrt( pvec0 + pvec3 ) * (fptype)nsf; + const cxtype_sv chi0 = cxmake( sqp0p3, 0. 
); + const cxtype_sv chi1 = cxmake( (fptype)nh * pvec1 / sqp0p3, pvec2 / sqp0p3 ); + if( nh == 1 ) + { + w[0] = cxzero_sv(); + w[1] = cxzero_sv(); + w[2] = chi0; + w[3] = chi1; + } + else + { + w[0] = chi1; + w[1] = chi0; + w[2] = cxzero_sv(); + w[3] = cxzero_sv(); + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction vc[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + vxxxxx( const fptype momenta[], // input: momenta + const fptype vmass, // input: vector boson mass + const int nhel, // input: -1, 0 (only if vmass!=0) or +1 (helicity of vector boson) + const int nsv, // input: +1 (final) or -1 (initial) + const int flv, // input: flavour + ALOHAOBJ & vc, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) + // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) + // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( vc.w ); + vc.pvec[0] = pvec0 * (fptype)nsv; + vc.pvec[1] = pvec1 * (fptype)nsv; + vc.pvec[2] = pvec2 * (fptype)nsv; + vc.pvec[3] = pvec3 * (fptype)nsv; + vc.flv_index = flv; + const fptype sqh = fpsqrt( 0.5 ); // AV this is > 0! + const fptype hel = nhel; + if( vmass != 0. 
) + { + const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD + const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); + if( pp == 0. ) + { + w[0] = cxmake( 0., 0. ); + w[1] = cxmake( -hel * sqh, 0. ); + w[2] = cxmake( 0., nsvahl * sqh ); + w[3] = cxmake( hel0, 0. ); + } + else + { + //printf( "DEBUG1011 (before emp): pvec0=%f vmass=%f pp=%f vmass*pp=%f\n", pvec0, vmass, pp, vmass * pp ); + //const fptype emp = pvec / ( vmass * pp ); // this may give a FPE #1011 (why?! maybe when vmass=+-epsilon?) + const fptype emp = pvec0 / vmass / pp; // workaround for FPE #1011 + //printf( "DEBUG1011 (after emp): emp=%f\n", emp ); + w[0] = cxmake( hel0 * pp / vmass, 0. ); + w[3] = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0. ); + if( pt != 0. ) + { + const fptype pzpt = pvec3 / ( pp * pt ) * sqh * hel; + w[1] = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -nsvahl * pvec2 / pt * sqh ); + w[2] = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, nsvahl * pvec1 / pt * sqh ); + } + else + { + w[1] = cxmake( -hel * sqh, 0. ); + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + //vc[4] = cxmake( 0., nsvahl * ( pvec3 < 0. ? -std::abs( sqh ) : std::abs( sqh ) ) ); // AV: why abs here? + w[2] = cxmake( 0., nsvahl * ( pvec3 < 0. ? -sqh : sqh ) ); // AV: removed an abs here + } + } +#else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); + // Branch A: pp == 0. + const cxtype vcA_2 = cxmake( 0, 0 ); + const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); + const cxtype vcA_4 = cxmake( 0, nsvahl * sqh ); + const cxtype vcA_5 = cxmake( hel0, 0 ); + // Branch B: pp != 0. 
+ volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + const fptype_v emp = pvec0 / ( vmass * ppDENOM ); // hack: dummy[ieppV] is not used if pp[ieppV]==0 + const cxtype_v vcB_2 = cxmake( hel0 * pp / vmass, 0 ); + const cxtype_v vcB_5 = cxmake( hel0 * pvec3 * emp + hel * pt / ppDENOM * sqh, 0 ); // hack: dummy[ieppV] is not used if pp[ieppV]==0 + // Branch B1: pp != 0. and pt != 0. + volatile fptype_v ptDENOM = fpternary( pt != 0, pt, 1. ); // hack: ptDENOM[ieppV]=1 if pt[ieppV]==0 + const fptype_v pzpt = pvec3 / ( ppDENOM * ptDENOM ) * sqh * hel; // hack: dummy[ieppV] is not used if pp[ieppV]==0 + const cxtype_v vcB1_3 = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -(fptype)nsvahl * pvec2 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0 + const cxtype_v vcB1_4 = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, (fptype)nsvahl * pvec1 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0 + // Branch B2: pp != 0. and pt == 0. + const cxtype vcB2_3 = cxmake( -hel * sqh, 0. ); + const cxtype_v vcB2_4 = cxmake( 0., (fptype)nsvahl * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here + // Choose between the results from branch A and branch B (and from branch B1 and branch B2) + const bool_v mask = ( pp == 0. ); + const bool_v maskB = ( pt != 0. ); + w[0] = cxternary( mask, vcA_2, vcB_2 ); + w[1] = cxternary( mask, vcA_3, cxternary( maskB, vcB1_3, vcB2_3 ) ); + w[2] = cxternary( mask, vcA_4, cxternary( maskB, vcB1_4, vcB2_4 ) ); + w[3] = cxternary( mask, vcA_5, vcB_5 ); +#endif + } + else + { + const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD + const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif + w[0] = cxzero_sv(); + w[3] = cxmake( hel * pt / pp * sqh, 0. 
); +#ifndef MGONGPU_CPPSIMD + if( pt != 0. ) + { + const fptype pzpt = pvec3 / ( pp * pt ) * sqh * hel; + w[1] = cxmake( -pvec1 * pzpt, -nsv * pvec2 / pt * sqh ); + w[2] = cxmake( -pvec2 * pzpt, nsv * pvec1 / pt * sqh ); + } + else + { + w[1] = cxmake( -hel * sqh, 0. ); + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + //w[2] = cxmake( 0, nsv * ( pvec3 < 0. ? -std::abs( sqh ) : std::abs( sqh ) ) ); // AV why abs here? + w[2] = cxmake( 0., nsv * ( pvec3 < 0. ? -sqh : sqh ) ); // AV: removed an abs here + } +#else + // Branch A: pt != 0. + volatile fptype_v ptDENOM = fpternary( pt != 0, pt, 1. ); // hack: ptDENOM[ieppV]=1 if pt[ieppV]==0 + const fptype_v pzpt = pvec3 / ( pp * ptDENOM ) * sqh * hel; // hack: dummy[ieppV] is not used if pt[ieppV]==0 + const cxtype_v vcA_3 = cxmake( -pvec1 * pzpt, -(fptype)nsv * pvec2 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0 + const cxtype_v vcA_4 = cxmake( -pvec2 * pzpt, (fptype)nsv * pvec1 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0 + // Branch B: pt == 0. + const cxtype vcB_3 = cxmake( -(fptype)hel * sqh, 0 ); + const cxtype_v vcB_4 = cxmake( 0, (fptype)nsv * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here + // Choose between the results from branch A and branch B + const bool_v mask = ( pt != 0. ); + w[1] = cxternary( mask, vcA_3, vcB_3 ); + w[2] = cxternary( mask, vcA_4, vcB_4 ); +#endif + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction sc[3] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + sxxxxx( const fptype momenta[], // input: momenta + //const fptype, // WARNING: input "smass" unused (missing in Fortran) - scalar boson mass + //const int, // WARNING: input "nhel" unused (missing in Fortran) - scalar has no helicity! 
+ const int nss, // input: +1 (final) or -1 (initial) + const int flv, // input: flavour + ALOHAOBJ &sc, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( sc.w ); + sc.pvec[0] = pvec0 * (fptype)nss; + sc.pvec[1] = pvec1 * (fptype)nss; + sc.pvec[2] = pvec2 * (fptype)nss; + sc.pvec[3] = pvec3 * (fptype)nss; + sc.flv_index = flv; + w[0] = cxmake( 1 + fptype_sv{ 0 }, 0 ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + oxxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + int flv, // input: flavour + ALOHAOBJ & fo, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) + // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) + // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! 
(#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( fo.w ); + fo.pvec[0] = pvec0 * (fptype)nsf; + fo.pvec[1] = pvec1 * (fptype)nsf; + fo.pvec[2] = pvec2 * (fptype)nsf; + fo.pvec[3] = pvec3 * (fptype)nsf; + fo.flv_index = flv; + const int nh = nhel * nsf; + if( fmass != 0. ) + { +#ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); + if( pp == 0. ) + { + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0. }; // possibility of negative fermion masses + //sqm[1] = ( fmass < 0. ? -abs( sqm[0] ) : abs( sqm[0] ) ); // AV: why abs here? + sqm[1] = ( fmass < 0. ? -sqm[0] : sqm[0] ); // AV: removed an abs here + const int ip = -( ( 1 - nh ) / 2 ) * nhel; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + const int im = ( 1 + nh ) / 2 * nhel; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + w[0] = cxmake( im * sqm[std::abs( ip )], 0 ); + w[1] = cxmake( ip * nsf * sqm[std::abs( ip )], 0 ); + w[2] = cxmake( im * nsf * sqm[std::abs( im )], 0 ); + w[3] = cxmake( ip * sqm[std::abs( im )], 0 ); + } + else + { + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype omega[2] = { fpsqrt( pvec0 + pp ), 0. 
}; + omega[1] = fmass / omega[0]; + const int ip = ( 1 + nh ) / 2; // NB: Fortran is (3+nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const int im = ( 1 - nh ) / 2; // NB: Fortran is (3-nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const fptype sfomeg[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; + const fptype pp3 = fpmax( pp + pvec3, 0. ); + const cxtype chi[2] = { cxmake( fpsqrt( pp3 * (fptype)0.5 / pp ), 0. ), + ( ( pp3 == 0. ) ? cxmake( -nh, 0. ) + : cxmake( nh * pvec1, -pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) }; + w[0] = sfomeg[1] * chi[im]; + w[1] = sfomeg[1] * chi[ip]; + w[2] = sfomeg[0] * chi[im]; + w[3] = sfomeg[0] * chi[ip]; + } +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + // Branch A: pp == 0. + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses + sqm[1] = ( fmass < 0 ? -sqm[0] : sqm[0] ); // AV: removed an abs here (as above) + const int ipA = -( ( 1 - nh ) / 2 ) * nhel; + const int imA = ( 1 + nh ) / 2 * nhel; + const cxtype foA_2 = imA * sqm[std::abs( ipA )]; + const cxtype foA_3 = ipA * nsf * sqm[std::abs( ipA )]; + const cxtype foA_4 = imA * nsf * sqm[std::abs( imA )]; + const cxtype foA_5 = ipA * sqm[std::abs( imA )]; + // Branch B: pp != 0. + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype_v omega[2] = { fpsqrt( pvec0 + pp ), 0 }; + omega[1] = fmass / omega[0]; + const int ipB = ( 1 + nh ) / 2; + const int imB = ( 1 - nh ) / 2; + const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; + const fptype_v pp3 = fpmax( pp + pvec3, 0. ); + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. 
); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + ( cxternary( ( pp3 == 0. ), + cxmake( -nh, 0. ), + cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 + const cxtype_v foB_2 = sfomeg[1] * chi[imB]; + const cxtype_v foB_3 = sfomeg[1] * chi[ipB]; + const cxtype_v foB_4 = sfomeg[0] * chi[imB]; + const cxtype_v foB_5 = sfomeg[0] * chi[ipB]; + // Choose between the results from branch A and branch B + const bool_v mask = ( pp == 0. ); + w[0] = cxternary( mask, foA_2, foB_2 ); + w[1] = cxternary( mask, foA_3, foB_3 ); + w[2] = cxternary( mask, foA_4, foB_4 ); + w[3] = cxternary( mask, foA_5, foB_5 ); +#endif + } + else + { +#ifdef MGONGPU_CPPSIMD + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), + cxternary( ( sqp0p3 == 0. ), + cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), + cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 +#else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); + const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. 
* pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; +#endif + if( nh == 1 ) + { + w[0] = chi[0]; + w[1] = chi[1]; + w[2] = cxzero_sv(); + w[3] = cxzero_sv(); + } + else + { + w[0] = cxzero_sv(); + w[1] = cxzero_sv(); + w[2] = chi[1]; + w[3] = chi[0]; + } + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ void + opzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + const int flv, // input: flavour + ALOHAOBJ & fo, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( fo.w ); + fo.pvec[0] = pvec3 * (fptype)nsf; + fo.pvec[1] = fptype_sv{ 0 }; + fo.pvec[2] = fptype_sv{ 0 }; + fo.pvec[3] = pvec3 * (fptype)nsf; + fo.flv_index = flv; + const int nh = nhel * nsf; + const cxtype_sv csqp0p3 = cxmake( fpsqrt( 2. * pvec3 ) * (fptype)nsf, 0. 
); + w[1] = cxzero_sv(); + w[2] = cxzero_sv(); + if( nh == 1 ) + { + w[0] = csqp0p3; + w[3] = cxzero_sv(); + } + else + { + w[0] = cxzero_sv(); + w[3] = csqp0p3; + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ void + omzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + const int flv, // input: flavour + ALOHAOBJ & fo, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( fo.w ); + fo.pvec[0] = -pvec3 * (fptype)nsf; + fo.pvec[1] = fptype_sv{ 0 }; + fo.pvec[2] = fptype_sv{ 0 }; + fo.pvec[3] = pvec3 * (fptype)nsf; + fo.flv_index = flv; + const int nh = nhel * nsf; + const cxtype_sv chi1 = cxmake( -nhel, 0. ) * fpsqrt( -2. * pvec3 ); + if( nh == 1 ) + { + w[0] = cxzero_sv(); + w[1] = chi1; + w[2] = cxzero_sv(); + w[3] = cxzero_sv(); + } + else + { + w[0] = cxzero_sv(); + w[1] = cxzero_sv(); + w[2] = chi1; + //w[3] = chi1; // AV: BUG! 
+ w[3] = cxzero_sv(); // AV: BUG FIX + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PT > 0) + template + __host__ __device__ void + oxzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + const int flv, // input: flavour + ALOHAOBJ & fo, // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* w = W_ACCESS::kernelAccess( fo.w ); + fo.pvec[0] = pvec0 * (fptype)nsf; + fo.pvec[1] = pvec1 * (fptype)nsf; + fo.pvec[2] = pvec2 * (fptype)nsf; + fo.pvec[3] = pvec3 * (fptype)nsf; + fo.flv_index = flv; + const int nh = nhel * nsf; + //const float sqp0p3 = sqrtf( pvec0 + pvec3 ) * nsf; // AV: why force a float here? + const fptype_sv sqp0p3 = fpsqrt( pvec0 + pvec3 ) * (fptype)nsf; + const cxtype_sv chi0 = cxmake( sqp0p3, 0. 
); + const cxtype_sv chi1 = cxmake( (fptype)nh * pvec1 / sqp0p3, -pvec2 / sqp0p3 ); + if( nh == 1 ) + { + w[0] = chi0; + w[1] = chi1; + w[2] = cxzero_sv(); + w[3] = cxzero_sv(); + } + else + { + w[0] = cxzero_sv(); + w[1] = cxzero_sv(); + w[2] = chi1; + w[3] = chi0; + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //========================================================================== diff --git a/PLUGIN/CUDACPP_OUTPUT/launch_plugin.py b/PLUGIN/CUDACPP_OUTPUT/launch_plugin.py new file mode 100644 index 0000000000..262d39a736 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/launch_plugin.py @@ -0,0 +1,140 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. + +import logging +import os +import subprocess +pjoin = os.path.join +logger = logging.getLogger('cmdprint') # for stdout + +try: + import madgraph +except ImportError: + import internal.madevent_interface as madevent_interface + import internal.misc as misc + import internal.extended_cmd as extended_cmd + import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface +else: + import madgraph.interface.madevent_interface as madevent_interface + import madgraph.various.misc as misc + import madgraph.interface.extended_cmd as extended_cmd + import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface + +class CPPMEInterface(madevent_interface.MadEventCmdShell): + def compile(self, *args, **opts): + """ """ + import multiprocessing + if not self.options['nb_core'] or self.options['nb_core'] == 'None': + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + 
common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'override FPTYPE': self.run_card['floating_type'] }) + misc.sprint('FPTYPE checked') + cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): + cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py + logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) + if cudacpp_backend in cudacpp_supported_backends : + args[0][0] = 'madevent_' + cudacpp_backend + '_link' + else: + raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends ) + return misc.compile(nb_core=self.options['nb_core'], *args, **opts) + else: + return misc.compile(nb_core=self.options['nb_core'], *args, **opts) + +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: fortran, cuda, hip, cpp, cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + +class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + + def reset_simd(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 + if name == "vector_size" and new_value <= int(old_value): + # code can handle the new size -> do not recompile + return + + # ok need to force recompilation of the cpp part + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanall'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanall'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + def default_setup(self): + super().default_setup() + self.add_param('floating_type', 'm', include=False, hidden=True, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f'], + comment='floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)' + ) + cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] + self.add_param('cudacpp_backend', 'cpp', include=False, hidden=False, + allowed=cudacpp_supported_backends) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self['limhel'] = 0 + self.display_block.append('simd') + self.display_block.append('psoptim') + + # OM/AV - overload the default version in banner.py + def write_one_include_file(self, output_dir, incname, output_file=None): + """write one include file at the time""" + if incname == "vector.inc": + if 'vector_size' not in self.user_set and 'wrap_size' not in self.user_set: return + if output_file is None: 
vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) + + def check_validity(self): + """ensure that PLUGIN information are consistent""" + super().check_validity() + if self['SDE_strategy'] != 1: + logger.warning('SDE_strategy different of 1 is not supported with SMD/GPU mode') + self['sde_strategy'] = 1 + if self['hel_recycling']: + self['hel_recycling'] = False + +class GPURunCard(CPPRunCard): + def default_setup(self): + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'cuda' + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size + +MEINTERFACE = CPPMEInterface +RunCard = CPPRunCard diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/.clang-format b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/.clang-format new file mode 100644 index 0000000000..0352374f4c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/.clang-format @@ -0,0 +1,229 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+# --- +# February 2022: latest draft for clang 13.0.0 (BasedOnStyle: Google) +# See https://releases.llvm.org/13.0.0/tools/clang/docs/ClangFormatStyleOptions.html +--- +Language: Cpp +BasedOnStyle: Google + +AccessModifierOffset: -2 # AV was -1 +AlignAfterOpenBracket: Align # AV ok +AlignArrayOfStructures: None # AV ok (alternative: Right, but code-generating it would be too complex) +AlignConsecutiveAssignments: None # AV ok +AlignConsecutiveBitFields: None # AV ok +AlignConsecutiveDeclarations: None # AV ok +AlignConsecutiveMacros: None # AV ok +AlignEscapedNewlines: DontAlign # AV was Left +AlignOperands: DontAlign # AV was Align +AlignTrailingComments: true # AV ok +AllowAllArgumentsOnNextLine: true # AV ok(?) +AllowAllConstructorInitializersOnNextLine: true # AV ok (NB: relevant only if ConstructorInitializerAllOnOneLineOrOnePerLine=true) +AllowAllParametersOfDeclarationOnNextLine: true # AV ok(?) +AllowShortBlocksOnASingleLine: Always # AV was Never +AllowShortEnumsOnASingleLine: true # AV ok +AllowShortCaseLabelsOnASingleLine: true # AV was false +AllowShortFunctionsOnASingleLine: All # AV ok +AllowShortLambdasOnASingleLine: All # AV ok +AllowShortIfStatementsOnASingleLine: WithoutElse # AV ok +AllowShortLoopsOnASingleLine: true # AV ok +###AlwaysBreakAfterDefinitionReturnType: None # AV keep defaults (deprecated) +#AlwaysBreakAfterReturnType: All # AV use this initially, then switch to TopLevelDefinitions! +AlwaysBreakAfterReturnType: TopLevelDefinitions # AV was None (alternative: All?)
+AlwaysBreakBeforeMultilineStrings: false # AV was true +AlwaysBreakTemplateDeclarations: Yes # AV ok +###AttributeMacros: # AV keep defaults (NB this is not about '__host__' attributes, see llvm/llvm-project/issues/45968) +### - __capability +BinPackArguments: false # AV was true +BinPackParameters: false # AV was true +BitFieldColonSpacing: Both # AV ok +BraceWrapping: # (NB: this is only relevant for "BreakBeforeBraces: Custom") + AfterCaseLabel: true # AV was false + AfterClass: true # AV was false + AfterControlStatement: Always # AV was Never + AfterEnum: true # AV was false + AfterFunction: true # AV was false + AfterNamespace: true # AV was false + AfterObjCDeclaration: true # AV was false + AfterStruct: true # AV was false + AfterUnion: true # AV was false + AfterExternBlock: true # AV was false (NB: does not work unless IndentExternBlock is AfterExternBlock?!) + BeforeCatch: true # AV was false + BeforeElse: true # AV was false + BeforeLambdaBody: true # AV was false + BeforeWhile: true # AV was false + IndentBraces: false # AV ok + SplitEmptyFunction: true # AV ok + SplitEmptyRecord: true # AV ok + SplitEmptyNamespace: true # AV ok +BreakAfterJavaFieldAnnotations: false +BreakBeforeBinaryOperators: None # AV ok +BreakBeforeBraces: Custom # AV was Attach (alternative: Allman) +BreakBeforeConceptDeclarations: true # AV ok +###BreakBeforeInheritanceComma: false # (obsolete???) +BreakBeforeTernaryOperators: true # AV ok +###BreakConstructorInitializersBeforeComma: true # AV was false (obsolete???) +BreakConstructorInitializers: BeforeComma # AV was BeforeColon +BreakInheritanceList: BeforeColon # AV ok (alternative: BeforeComma?) 
+BreakStringLiterals: false # AV was true +ColumnLimit: 0 # AV was 80 +###CommentPragmas: '^[^ ]*' # AV use SpacesInLineCommentPrefix Min=0 Max=1 to allow both "//comment" and "// comment" +CompactNamespaces: false # AV ok +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 2 # AV was 4 +ContinuationIndentWidth: 2 # AV was 4 +Cpp11BracedListStyle: true # AV ok +DeriveLineEnding: false # AV was true +DerivePointerAlignment: false # AV was true +DisableFormat: false # AV ok +EmptyLineAfterAccessModifier: Leave # AV was Never +EmptyLineBeforeAccessModifier: Leave # AV was LogicalBlock +ExperimentalAutoDetectBinPacking: false # AV ok ("use at your own risk") +FixNamespaceComments: false # AV was true +###ForEachMacros: # AV keep defaults +### - foreach +### - Q_FOREACH +### - BOOST_FOREACH +###IfMacros: # AV keep defaults +### - KJ_IF_MAYBE +IncludeBlocks: Regroup # AV ok +IncludeCategories: + - Regex: '^' + Priority: 4 # AV was 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*\.h>' + Priority: 5 # AV was 1 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*' + Priority: 6 # AV was 2 + SortPriority: 0 + CaseSensitive: false + - Regex: 'mgOnGpuConfig.h' + Priority: 1 # AV new + SortPriority: 0 + CaseSensitive: false + - Regex: 'mgOnGpu*.*' + Priority: 2 # AV new + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 3 # AV was 3 + SortPriority: 0 + CaseSensitive: false +###IncludeIsMainRegex: '([-_](test|unittest))?$' # AV keep defaults +###IncludeIsMainSourceRegex: '' # AV keep defaults +IndentAccessModifiers: false # AV ok +IndentCaseLabels: true # AV ok +IndentCaseBlocks: false # AV ok +IndentGotoLabels: false # AV was true +IndentPPDirectives: None # AV ok (NB: AfterHash and BeforeHash do not seem to work as intended) +###IndentExternBlock: Indent # AV was AfterExternBlock +IndentExternBlock: AfterExternBlock # AV ok (only with Custom BraceWrapping.AfterExternBlock = true) +IndentRequires: false # 
AV ok(?) +IndentWidth: 2 # AV ok +IndentWrappedFunctionNames: false # AV ok +###InsertTrailingCommas: None # AV keep defaults (Java only?) +###JavaScriptQuotes: Leave # AV irrelevant +###JavaScriptWrapImports: true # AV irrelevant +KeepEmptyLinesAtTheStartOfBlocks: false # AV ok +LambdaBodyIndentation: Signature # AV ok +###MacroBlockBegin: '' # AV keep defaults +###MacroBlockEnd: '' # AV keep defaults +MaxEmptyLinesToKeep: 1 # AV ok +NamespaceIndentation: All # AV was None +###ObjCBinPackProtocolList: Never # AV irrelevant +###ObjCBlockIndentWidth: 2 # AV irrelevant +###ObjCBreakBeforeNestedBlockParam: true # AV irrelevant +###ObjCSpaceAfterProperty: false # AV irrelevant +###ObjCSpaceBeforeProtocolList: true # AV irrelevant +###PenaltyBreakAssignment: 2 # AV keep defaults +###PenaltyBreakBeforeFirstCallParameter: 1 # AV keep defaults +###PenaltyBreakComment: 300 # AV keep defaults +###PenaltyBreakFirstLessLess: 120 # AV keep defaults +###PenaltyBreakString: 1000 # AV keep defaults +###PenaltyBreakTemplateDeclaration: 10 # AV keep defaults +###PenaltyExcessCharacter: 1000000 # AV keep defaults +###PenaltyReturnTypeOnItsOwnLine: 200 # AV keep defaults +###PenaltyIndentedWhitespace: 0 # AV keep defaults +PointerAlignment: Left # AV ok +PPIndentWidth: 0 # AV was -1 +###RawStringFormats: # AV keep defaults +### - Language: Cpp +### Delimiters: +### - cc +### - CC +### - cpp +### - Cpp +### - CPP +### - 'c++' +### - 'C++' +### CanonicalDelimiter: '' +### BasedOnStyle: google +### - Language: TextProto +### Delimiters: +### - pb +### - PB +### - proto +### - PROTO +### EnclosingFunctions: +### - EqualsProto +### - EquivToProto +### - PARSE_PARTIAL_TEXT_PROTO +### - PARSE_TEST_PROTO +### - PARSE_TEXT_PROTO +### - ParseTextOrDie +### - ParseTextProtoOrDie +### - ParseTestProto +### - ParsePartialTestProto +### CanonicalDelimiter: pb +### BasedOnStyle: google +ReferenceAlignment: Pointer # AV ok +ReflowComments: false # AV was true +ShortNamespaceLines: 1 # AV ok 
+SortIncludes: CaseSensitive # AV ok +###SortJavaStaticImport: Before # irrelevant +SortUsingDeclarations: false # AV was true +SpaceAfterCStyleCast: false # AV ok +SpaceAfterLogicalNot: false # AV ok +SpaceAfterTemplateKeyword: false # AV was true +SpaceAroundPointerQualifiers: Default # AV ok (alternative: Before?) +SpaceBeforeAssignmentOperators: true # AV ok +SpaceBeforeCaseColon: false # AV ok +SpaceBeforeCpp11BracedList: false # AV ok +SpaceBeforeCtorInitializerColon: true # AV ok +SpaceBeforeInheritanceColon: true # AV ok +SpaceBeforeParens: Never # AV was ControlStatements +SpaceBeforeRangeBasedForLoopColon: false # AV was true +SpaceBeforeSquareBrackets: false # AV ok +SpaceInEmptyBlock: false # AV ok +SpaceInEmptyParentheses: false # AV ok +SpacesBeforeTrailingComments: 1 # AV was 2 +SpacesInAngles: Never # AV ok +SpacesInConditionalStatement: false # AV ok (does this work?) +SpacesInContainerLiterals: false # AV was true +SpacesInCStyleCastParentheses: false # AV ok +SpacesInLineCommentPrefix: + Minimum: 0 # AV was 1 + Maximum: 1 # AV was -1 +SpacesInParentheses: true # AV was false +SpacesInSquareBrackets: false # AV ok +Standard: c++17 # AV was Auto +###StatementAttributeLikeMacros: # AV keep defaults +### - Q_EMIT +###StatementMacros: # AV keep defaults +### - Q_UNUSED +### - QT_REQUIRE_VERSION +###TabWidth: 8 # AV irrelevant if UseTab=Never? +UseCRLF: false # AV ok (but set DeriveLineEnding=false) +UseTab: Never # AV ok +###WhitespaceSensitiveMacros: # AV keep defaults +### - STRINGIZE +### - PP_STRINGIZE +### - BOOST_PP_STRINGIZE +### - NS_SWIFT_NAME +### - CF_SWIFT_NAME +... diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/AUTHORS b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/AUTHORS new file mode 100644 index 0000000000..a9887177ea --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/AUTHORS @@ -0,0 +1,34 @@ +Copyright (C) 2020-2025 CERN and UCLouvain. 
+Licensed under the GNU Lesser General Public License (version 3 or later). +All rights not expressly granted are reserved. + +# Authors + +The development team of the MG5aMC CUDACPP plugin and of the code that it +generates includes the following authors: + + Stephan Hageboeck (CERN) + Daniele Massaro (CERN) + Olivier Mattelaer (Universite Catholique de Louvain, original author) + Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) + Andrea Valassi (CERN, original author) + Zenny Wettersten (CERN) + +# Collaborators + +The development team of the MG5aMC CUDACPP plugin and of the code that it +generates benefitted significantly from the much appreciated and thankfully +acknowledged collaboration with the following collaborators: + + Tyler J. Burch (Argonne National Laboratory) + Taylor Childers (Argonne National Laboratory) + Laurence Field (CERN) + Walter Hopkins (Argonne National Laboratory) + Nathan S. Nichols (Argonne National Laboratory) + Filip Optolowicz (CERN) + Andreas Reepschlaeger (CERN) + Taran Singhania (PES University Bangalore) + David Smith (CERN) + Carl Vuosalo (University of Wisconsin-Madison) + diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/CMakeLists.txt b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/CMakeLists.txt new file mode 100644 index 0000000000..ae8222f087 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +# Minimal CMake configuration to build a functional CPU version + +cmake_minimum_required(VERSION 3.22) + +project(Madgraph4GPU) + +include(${PROJECT_SOURCE_DIR}/CMake/Platforms.txt) +include(${PROJECT_SOURCE_DIR}/CMake/Compilers.txt) +include(${PROJECT_SOURCE_DIR}/CMake/Macros.txt) + +set(PROJECT_GITROOT_DIR ${PROJECT_SOURCE_DIR}/../../..) + +add_subdirectory(src) +add_subdirectory(SubProcesses) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Compilers.txt b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Compilers.txt new file mode 100644 index 0000000000..52f4b1286f --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Compilers.txt @@ -0,0 +1,7 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2024) for the MG5aMC CUDACPP plugin. + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Macros.txt b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Macros.txt new file mode 100644 index 0000000000..b6df33ba5b --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Macros.txt @@ -0,0 +1,15 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +MACRO(SUBDIRLIST result) + FILE(GLOB children RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/*) + SET(dirlist "") + FOREACH(child ${children}) + IF(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${child}) + LIST(APPEND dirlist ${child}) + ENDIF() + ENDFOREACH() + SET(${result} ${dirlist}) +ENDMACRO() diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Platforms.txt b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Platforms.txt new file mode 100644 index 0000000000..f2a67f8e7f --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/Platforms.txt @@ -0,0 +1,8 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2024) for the MG5aMC CUDACPP plugin. + +if (CMAKE_HOST_APPLE) + add_definitions(-DMGONGPU_HAS_NO_CURAND) +endif(CMAKE_HOST_APPLE) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/SubProcesses/CMakeLists.txt b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/SubProcesses/CMakeLists.txt new file mode 100644 index 0000000000..86634c5a28 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/SubProcesses/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +SUBDIRLIST(SUBDIRS) +FOREACH(subdir ${SUBDIRS}) + ADD_SUBDIRECTORY(${subdir}) +ENDFOREACH() diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/SubProcesses/CMakeLists_P.txt b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/SubProcesses/CMakeLists_P.txt new file mode 100644 index 0000000000..c91dac301c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/SubProcesses/CMakeLists_P.txt @@ -0,0 +1,29 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2024) for the MG5aMC CUDACPP plugin. + +get_filename_component(basename ${CMAKE_CURRENT_SOURCE_DIR} NAME) +string(TOLOWER ${basename} targadd) + +file(GLOB_RECURSE HEADERS "../*.h" CPPProcess.h) +set(SOURCES ../BridgeKernels.cc CPPProcess.cc ../CrossSectionKernels.cc + ../MatrixElementKernels.cc ../RamboSamplingKernels.cc + ../RandomNumberKernels.cc) + +set(libname mg5amc_cxx_${targadd}) +add_library(${libname} ${SOURCES} ${HEADERS}) +target_include_directories(${libname} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}" + "${PROJECT_SOURCE_DIR}/src" + "${PROJECT_GITROOT_DIR}/tools") + +set(execname check_${targadd}.exe) +add_executable(${execname} check_sa.cc) +target_link_libraries(${execname} PUBLIC mg5amc_common ${libname}) +target_include_directories(${execname} PRIVATE "${PROJECT_SOURCE_DIR}/src") + +# some XCode specific stuff to make the executable run +set_property(TARGET ${libname} PROPERTY XCODE_GENERATE_SCHEME TRUE) +set_property(TARGET ${execname} PROPERTY XCODE_GENERATE_SCHEME TRUE) +set_property(TARGET ${execname} PROPERTY XCODE_SCHEME_ARGUMENTS "--bridge" "8" "8" "32") +set_property(TARGET ${execname} PROPERTY XCODE_SCHEME_WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}") diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/src/CMakeLists.txt 
b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/src/CMakeLists.txt new file mode 100644 index 0000000000..c952f113c5 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/CMake/src/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2024) for the MG5aMC CUDACPP plugin. + +file(GLOB_RECURSE HEADERS "*.h") +add_library(mg5amc_common Parameters.cc read_slha.cc ${HEADERS}) + +# some XCode specific stuff to make the executable run +set_property(TARGET mg5amc_common PROPERTY XCODE_GENERATE_SCHEME TRUE) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYING b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYING new file mode 100644 index 0000000000..f288702d2f --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. 
diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYING.LESSER b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYING.LESSER new file mode 100644 index 0000000000..0a041280bd --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYING.LESSER @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. 
+ + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT new file mode 100644 index 0000000000..d5f6746559 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT @@ -0,0 +1,58 @@ +Copyright (C) 2020-2025 CERN and UCLouvain. +Licensed under the GNU Lesser General Public License (version 3 or later). +All rights not expressly granted are reserved. + +The copyright and license notice above cover the CUDACPP code-generating plugin +of the MadGraph5_aMC@NLO (in the following "MG5aMC") software, and all code +generated using that plugin. These are collectively referred to as "this work" +or "the MG5aMC CUDACPP plugin and the code that it generates", or more simply +as "the MG5aMC CUDACPP plugin", in the following and throughout this work. + +The MG5aMC CUDACPP plugin and the code that it generates are based on the +initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on +CPUs using vectorized C++ by three original authors from CERN and UCLouvain. +The full development team currently includes the following authors : + Stephan Hageboeck (CERN) + Daniele Massaro (CERN) + Olivier Mattelaer (Universite Catholique de Louvain, original author) + Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) + Andrea Valassi (CERN, original author) + Zenny Wettersten (CERN) +See https://github.com/madgraph5/madgraph4gpu for more details. For the full +list of authors and collaborators of this work, see the file "AUTHORS" in the +same directory as this "COPYRIGHT" file in the source code of the plugin. 
+ +The MG5aMC CUDACPP plugin and the code that it generates are derived from, and +are intended to be used in combination with, the MG5aMC software and the code +that it generates. The MG5aMC software is developed by the MadGraph5_aMC@NLO +development team and contributors, also known as the "MadTeam", who are the +owners of its copyright and have licensed it as specified in +https://github.com/mg5amcnlo/mg5amcnlo/blob/main/madgraph/LICENSE. +For the full list of authors and contributors of the MG5aMC software, see +https://github.com/mg5amcnlo/mg5amcnlo/blob/main/madgraph/AUTHORS. + +The MG5aMC CUDACPP plugin and the code that it generates are free software; +you can redistribute them and/or modify them under the terms of the GNU Lesser +General Public License as published by the Free Software Foundation, either +version 3 or (at your option) any later version. + +This work is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. + +The GNU Lesser General Public License (LGPL) version 3 is copied verbatim in +the file "COPYING.LESSER" in the same directory as this "COPYRIGHT" file. It is +also available at . + +This version of the GNU Lesser General Public License incorporates the terms +and conditions of version 3 of the GNU General Public License (GPL), which is +copied verbatim in the file "COPYING" in the same directory as this "COPYRIGHT" +file and is also available at . + +In line with the license above, the authors emphasise the following points. For +the developers' and authors' protection, the GPL clearly explains that there is +no warranty for this free software. For both users' and authors' sake, the GPL +requires that modified versions be marked as changed, so that their problems +will not be attributed erroneously to authors of previous versions. 
+ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc new file mode 100644 index 0000000000..baebfe3aab --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -0,0 +1,100 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +%(info_lines)s +//========================================================================== + +#include "Parameters.h" + +#include <iomanip> +#include <iostream> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +#ifndef MGONGPU_HARDCODE_PARAM + +// Initialize static instance +Parameters* Parameters::instance = 0; + +// Function to get static instance - only one instance per program +Parameters* +Parameters::getInstance() +{ + if( instance == 0 ) + instance = new Parameters(); + return instance; +} + +void +Parameters::setIndependentParameters( SLHAReader& slha ) +{ + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector<int> indices( 2, 0 ); // prepare a vector for indices + %(set_independent_parameters)s +} + +void +Parameters::setIndependentCouplings() +{ + %(set_independent_couplings)s + %(set_flv_couplings)s +} + +/* +void +Parameters::setDependentParameters() // now computed event-by-event (running alphas #373) +{ + 
%(set_dependent_parameters)s +} + +void +Parameters::setDependentCouplings() // now computed event-by-event (running alphas #373) +{ + %(set_dependent_couplings)s +} +*/ + +#endif + +// Routines for printing out parameters +void +Parameters::printIndependentParameters() +{ + std::cout << "model parameters independent of event kinematics:" << std::endl; + std::cout << "(Warning: aS in the runcard is ignored because event-by-event Gs are hardcoded or retrieved from Fortran)" << std::endl; + %(print_independent_parameters)s +} + +void +Parameters::printIndependentCouplings() +{ + std::cout << "model couplings independent of event kinematics:" << std::endl; + %(print_independent_couplings)s +} + +/* +void +Parameters::printDependentParameters() // now computed event-by-event (running alphas #373) +{ + std::cout << "model parameters dependent on event kinematics:" << std::endl; + %(print_dependent_parameters)s +} + +void +Parameters::printDependentCouplings() // now computed event-by-event (running alphas #373) +{ + std::cout << "model couplings dependent on event kinematics:" << std::endl; + %(print_dependent_couplings)s +} +*/ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc new file mode 100644 index 0000000000..5da52d140c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -0,0 +1,260 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +%(info_lines)s +//========================================================================== + +#ifndef Parameters_H +#define Parameters_H + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" +#include "mgOnGpuVectors.h" + +#include "constexpr_math.h" + +//========================================================================== + +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +%(bsmdefine)s + +#ifndef MGONGPU_HARDCODE_PARAM%(eftwarn0)s + +#include "read_slha.h" + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Define FLV_COUPLING struct + struct FLV_COUPLING { + static constexpr int max_flavor = %(max_flavor)d; + + int partner1[max_flavor]; + int partner2[max_flavor]; + cxtype* value[max_flavor]; // because it will refer already calculated couplings + + FLV_COUPLING() { + for (int i = 0; i < max_flavor; ++i) { + partner1[i] = -1; + partner2[i] = -1; + } + } + }; + + class Parameters + { + public: + + static Parameters* getInstance(); + + // Define "zero" + double zero, ZERO; + + %(independent_parameters)s + + %(independent_couplings)s + + %(dependent_parameters)s + + %(dependent_couplings)s + + %(flavor_independent_couplings)s + + %(flavor_dependent_couplings)s + + // Set parameters that are unchanged during the run + void setIndependentParameters( SLHAReader& slha ); + + // Set couplings that are 
unchanged during the run + void setIndependentCouplings(); + + // Set parameters that are changed event by event + //void setDependentParameters(); // now computed event-by-event (running alphas #373) + + // Set couplings that are changed event by event + //void setDependentCouplings(); // now computed event-by-event (running alphas #373) + + // Print parameters that are unchanged during the run + void printIndependentParameters(); + + // Print couplings that are unchanged during the run + void printIndependentCouplings(); + + // Print parameters that are changed event by event + //void printDependentParameters(); // now computed event-by-event (running alphas #373) + + // Print couplings that are changed event by event + //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + static constexpr int nBsmIndepParam = %(nbsmip)i; + %(hasbsmip)sdouble mdl_bsmIndepParam[nBsmIndepParam]; + + private: + + static Parameters* instance; + }; + +} // end namespace mg5amcGpu/mg5amcCpu + +#else%(eftwarn1)s + +#include +#include + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Hardcoded constexpr physics parameters + namespace Parameters // keep the same name rather than HardcodedParameters for simplicity + { + // Model parameters independent of aS + constexpr double zero = 0; + constexpr double ZERO = 0; + %(hardcoded_independent_parameters)s + // Model couplings independent of aS + %(hardcoded_independent_couplings)s + // Model parameters dependent on aS + %(hardcoded_dependent_parameters)s + // Model couplings dependent on aS + %(hardcoded_dependent_couplings)s + // Print parameters that are unchanged during the run + void printIndependentParameters(); 
+ + // Print couplings that are unchanged during the run + void printIndependentCouplings(); + + // Print parameters that are changed event by event + //void printDependentParameters(); // now computed event-by-event (running alphas #373) + + // Print couplings that are changed event by event + //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + constexpr int nBsmIndepParam = %(nbsmip)i; + %(hasbsmip)s__device__ constexpr double mdl_bsmIndepParam[nBsmIndepParam] = { %(bsmip)s }; + } + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif + +//========================================================================== + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + namespace Parameters_dependentCouplings + { + constexpr size_t ndcoup = %(ndcoup)d; // #couplings that vary event by event because they depend on the running alphas QCD +%(idcoup)s + struct DependentCouplings_sv + { +%(dcoupdecl)s + }; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> +#ifdef MGONGPUCPP_GPUIMPL +#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 // e.g. 
<> +#endif + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) + { +#ifdef MGONGPU_HARDCODE_PARAM + using namespace Parameters; +#else%(eftspecial0)s +#endif + // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters) because: + // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below + const cxtype cI( 0., 1. ); + DependentCouplings_sv out; +#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT ) + // Couplings are (scalar, or vector of) doubles, or scalar floats - default implementation + { + const fptype_sv& G = G_sv; + // Model parameters dependent on aS +%(dcoupsetdpar)s + // Model couplings dependent on aS +%(dcoupsetdcoup)s + } +#else + // Couplings are VECTORS OF FLOATS: #439 special handling is needed (variable Gs are vector floats, fixed parameters are scalar doubles) + // Use an explicit loop to avoid <> + // Problems may come e.g. 
in EFTs from multiplying a vector float (related to aS-dependent G) by a scalar double (aS-independent parameters) + // (NB in pure SM processes this special handling is not needed, but we keep it here for simplicity, see PR #824)%(dcoupoutfptypev2)s + for( int i = 0; i < neppV; i++ ) + { + const fptype& G = G_sv[i]; + // Model parameters dependent on aS +%(dcoupsetdpar2)s + // Model couplings dependent on aS + %(dcoupsetdcoup2)s + }%(dcoupoutdcoup2)s +#endif + return out; + } +#ifdef MGONGPUCPP_GPUIMPL +#pragma GCC diagnostic pop +#pragma nv_diagnostic pop +#endif + } + + //========================================================================== + + namespace Parameters_independentCouplings + { + constexpr size_t nicoup = %(nicoup)d; // #couplings that are fixed for all events because they do not depend on the running alphas QCD +%(iicoup)s + } + + //========================================================================== + +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> +#endif + // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[], + const double* bsmIndepParamPtr ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); +%(dcoupaccessbuffer)s%(dcoupkernelaccess)s%(dcoupcompute)s + mgDebug( 1, __FUNCTION__ ); + return; + } +#pragma GCC diagnostic pop + +} // end namespace mg5amcGpu/mg5amcCpu + +//========================================================================== + +#endif // Parameters_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h new file mode 100644 index 0000000000..4b90d4e8e1 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -0,0 +1,633 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: D. Massaro, S. Roiser, J. Teig, A. Thete, A. Valassi, Z. Wettersten +// (2021-2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef BRIDGE_H +#define BRIDGE_H 1 + +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" // for CPPProcess +#include "CrossSectionKernels.h" // for flagAbnormalMEs +#include "MatrixElementKernels.h" // for MatrixElementKernelHost, MatrixElementKernelDevice +#include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM +#include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc + +//#ifdef __HIPCC__ +//#include // see +//https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 #else #include +// // bypass this completely to ease portability on LUMI #803 #endif + +#include // bypass std::filesystem #803 + +#include +#include +#include +#include +#include +#include +#include + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + /** + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such + * pointers. + */ + struct CppObjectInFortran + { + CppObjectInFortran() {} + virtual ~CppObjectInFortran() {} + }; + + //-------------------------------------------------------------------------- + /** + * A templated class for calling the CUDA/C++ matrix element calculations of the + * event generation workflow. The FORTRANFPTYPE template parameter indicates the + * precision of the Fortran momenta from MadEvent (float or double). The + * precision of the matrix element calculation is hardcoded in the fptype + * typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , + * . In memory, this is stored in a way that C reads as an array + * P_MULTI[nevtF][nparF][np4F]. The CUDA/C++ momenta are stored as an + * array[npagM][npar][np4][neppM] with nevt=npagM*neppM. 
The Bridge is + * configured to store nevt==nevtF events in CUDA/C++. It also checks that + * Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ + * sigmaKin. In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use + * double or float. In the check_sa "--bridge" test, everything is implemented + * in fptype (double or float). + */ + template + class Bridge final : public CppObjectInFortran + { + public: + /** + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array + * loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in + * Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays + * (KEPT FOR SANITY CHECKS ONLY) + */ + Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); + + /** + * Destructor + */ + virtual ~Bridge() {} + + // Delete copy/move constructors and assignment operators + Bridge( const Bridge& ) = delete; + Bridge( Bridge&& ) = delete; + Bridge& operator=( const Bridge& ) = delete; + Bridge& operator=( Bridge&& ) = delete; + +#ifdef MGONGPUCPP_GPUIMPL + /** + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != + * gpublocks*gputhreads (this is needed for BridgeKernel tests rather than for + * actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ + void set_gpugrid( const int gpublocks, const int gputhreads ); + + /** + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param iflavorVec the index of the flavor combination + * @param rndhel the pointer 
to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? + */ + void gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const unsigned int* iflavorVec, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); +#else + /** + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant + * alphas) + * @param iflavorVec the index of the flavor combination + * @param rndhel the pointer to the input random numbers for helicity + * selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelIds the Feynman diagram to enhance in multi-channel mode if 1 + * to n + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, const unsigned int* iflavorVec, const FORTRANFPTYPE* rndhel, const FORTRANFPTYPE* rndcol, const unsigned int* channelIds, FORTRANFPTYPE* mes, int* selhel, int* selcol, const bool goodHelOnly = false ); +#endif + + // Return the number of good helicities (-1 initially when they have not yet + // been calculated) + int nGoodHel() const { return m_nGoodHel; } + + // Return the total number of helicities (expose cudacpp ncomb in the Bridge + // interface to Fortran) + constexpr int nTotHel() const { return CPPProcess::ncomb; } + + private: + unsigned int m_nevt; // number of events + int m_nGoodHel; // the number of good helicities (-1 initially when they have + // not yet been calculated) + +#ifdef MGONGPUCPP_GPUIMPL + int m_gputhreads; // number of gpu threads (default set from number of + // events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, + // can be modified) + DeviceBuffer m_devMomentaF; + DeviceBufferMomenta m_devMomentaC; + DeviceBufferGs m_devGs; + DeviceBufferIflavorVec m_devIflavorVec; + DeviceBufferRndNumHelicity m_devRndHel; + DeviceBufferRndNumColor m_devRndCol; + DeviceBufferMatrixElements m_devMEs; + DeviceBufferSelectedHelicity m_devSelHel; + DeviceBufferSelectedColor m_devSelCol; + DeviceBufferChannelIds m_devChannelIds; + PinnedHostBufferIflavorVec m_hstIflavorVec; + PinnedHostBufferGs m_hstGs; + PinnedHostBufferRndNumHelicity m_hstRndHel; + PinnedHostBufferRndNumColor m_hstRndCol; + PinnedHostBufferMatrixElements m_hstMEs; + PinnedHostBufferSelectedHelicity m_hstSelHel; + PinnedHostBufferSelectedColor m_hstSelCol; + PinnedHostBufferChannelIds m_hstChannelIds; + std::unique_ptr m_pmek; + // static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads + // (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = + 32; // minimum number of gpu threads (DEFAULT) +#else + HostBufferMomenta 
m_hstMomentaC; + HostBufferGs m_hstGs; + HostBufferIflavorVec m_hstIflavorVec; + HostBufferRndNumHelicity m_hstRndHel; + HostBufferRndNumColor m_hstRndCol; + HostBufferMatrixElements m_hstMEs; + HostBufferSelectedHelicity m_hstSelHel; + HostBufferSelectedColor m_hstSelCol; + HostBufferChannelIds m_hstChannelIds; + std::unique_ptr m_pmek; +#endif + }; + + //-------------------------------------------------------------------------- + // + // Forward declare transposition methods + // + +#ifdef MGONGPUCPP_GPUIMPL + + template + __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); + +#endif // MGONGPUCPP_GPUIMPL + + template + void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); + + template + void hst_transposeMomentaC2F( const Tin* in, Tout* out, const unsigned int nevt ); + + //-------------------------------------------------------------------------- + // + // Implementations of member functions of class Bridge + // + + template + Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F) + : m_nevt( nevtF ), m_nGoodHel( -1 ) +#ifdef MGONGPUCPP_GPUIMPL + , m_gputhreads( 256 ) // default number of gpu threads + , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads + , m_devMomentaF( m_nevt ) + , m_devMomentaC( m_nevt ) + , m_devIflavorVec( m_nevt ) + , m_devGs( m_nevt ) + , m_devRndHel( m_nevt ) + , m_devRndCol( m_nevt ) + , m_devMEs( m_nevt ) + , m_devSelHel( m_nevt ) + , m_devSelCol( m_nevt ) + , m_devChannelIds( m_nevt ) +#else + , m_hstMomentaC( m_nevt ) +#endif + , m_hstGs( m_nevt ) + , m_hstIflavorVec( m_nevt ) + , m_hstRndHel( m_nevt ) + , m_hstRndCol( m_nevt ) + , m_hstMEs( m_nevt ) + , m_hstSelHel( m_nevt ) + , m_hstSelCol( m_nevt ) + , m_hstChannelIds( m_nevt ) + , m_pmek( nullptr ) + { + if( nparF != CPPProcess::npar ) + throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) + throw 
std::runtime_error( "Bridge constructor: np4 mismatch" ); +#ifdef MGONGPUCPP_GPUIMPL + if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) + throw std::runtime_error( + "Bridge constructor: nevt should be a multiple of " + + std::to_string( s_gputhreadsmin ) ); + while( m_nevt != m_gpublocks * m_gputhreads ) + { + m_gputhreads /= 2; + if( m_gputhreads < s_gputhreadsmin ) + throw std::logic_error( + "Bridge constructor: FIXME! cannot choose gputhreads" ); // this + // should + // never + // happen! + m_gpublocks = m_nevt / m_gputhreads; + } +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelDevice( + m_devMomentaC, m_devGs, m_devIflavorVec, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads) ); +#else +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" + << std::endl; +#endif + m_pmek.reset( new MatrixElementKernelHost( + m_hstMomentaC, m_hstGs, m_hstIflavorVec, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPUCPP_GPUIMPL + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only + // needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate + // is called from several Fortran threads? 
+ CPPProcess process( /*verbose=*/false ); + std::string paramCard = + "../Cards/param_card.dat"; // ZW: change default param_card.dat location + // to one dir down + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + +paramCard; #endif + */ + // struct stat dummybuffer; // bypass std::filesystem #803 + // if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + + // paramCard; // + auto fileExists = []( std::string& fileName ) + { + struct stat buffer; + return stat( fileName.c_str(), &buffer ) == 0; + }; + size_t paramCardCheck = 2; // ZW: check for paramCard up to 2 directories up + for( size_t k = 0; k < paramCardCheck; ++k ) + { + if( fileExists( paramCard ) ) break; // bypass std::filesystem #803 + paramCard = "../" + paramCard; + } + process.initProc( paramCard ); + } + +#ifdef MGONGPUCPP_GPUIMPL + template + void Bridge::set_gpugrid( const int gpublocks, + const int gputhreads ) + { + if( m_nevt != gpublocks * gputhreads ) + throw std::runtime_error( + "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + m_gpublocks = gpublocks; + m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE + std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt + << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" + << std::endl; +#endif + m_pmek->setGrid( m_gpublocks, m_gputhreads ); + } +#endif + +#ifdef MGONGPUCPP_GPUIMPL + template + void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const unsigned int* iflavorVec, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly ) + { + constexpr int neppM = MemoryAccessMomenta::neppM; + if constexpr( neppM == 1 && std::is_same_v ) + { + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); + } + else + { + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + const int thrPerEvt = + CPPProcess::npar * + CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 + // event per thread) + // const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... + // this seems slower + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + } + if constexpr( std::is_same_v ) + { + memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); + } + else + { + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); + } + const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... 
// no need to initialize m_hstChannel: it is allocated with + // gpuMallocHost and NOT initialized in PinnedHostBufferBase, but it is NOT + // used later on + // initialise iflavorVec + memcpy( m_hstIflavorVec.data(), iflavorVec, m_nevt * sizeof( unsigned int ) ); + copyDeviceFromHost( m_devGs, m_hstGs ); + copyDeviceFromHost( m_devRndHel, m_hstRndHel ); + copyDeviceFromHost( m_devRndCol, m_hstRndCol ); + if( useChannelIds ) copyDeviceFromHost( m_devChannelIds, m_hstChannelIds ); + copyDeviceFromHost( m_devIflavorVec, m_hstIflavorVec ); + if( m_nGoodHel < 0 ) + { + m_nGoodHel = m_pmek->computeGoodHelicities(); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + } + if( goodHelOnly ) return; + m_pmek->computeMatrixElements( useChannelIds ); + copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE + flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif + copyHostFromDevice( m_hstSelHel, m_devSelHel ); + copyHostFromDevice( m_hstSelCol, m_devSelCol ); + if constexpr( std::is_same_v ) + { + memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); + memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); + memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); + } + else + { + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); + } + } +#endif + +#ifndef MGONGPUCPP_GPUIMPL + template + void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const unsigned int* iflavorVec, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly ) + { + hst_transposeMomentaF2C( momenta, m_hstMomentaC.data(), m_nevt ); + if constexpr( std::is_same_v ) + { + memcpy( m_hstGs.data(), gs, m_nevt * sizeof( 
FORTRANFPTYPE ) ); + memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); + } + else + { + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); + } + const bool useChannelIds = ( channelIds != nullptr ) && ( !goodHelOnly ); + if( useChannelIds ) + memcpy( m_hstChannelIds.data(), channelIds, m_nevt * sizeof( unsigned int ) ); + // else ... // no need to initialize m_hstChannel: it is allocated and default + // initialized in HostBufferBase (and it is not used later on anyway) + // initialise iflavorVec + memcpy( m_hstIflavorVec.data(), iflavorVec, m_nevt * sizeof( unsigned int ) ); + if( m_nGoodHel < 0 ) + { + m_nGoodHel = m_pmek->computeGoodHelicities(); + if( m_nGoodHel < 0 ) + throw std::runtime_error( + "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + } + if( goodHelOnly ) return; + m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE + flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif + if constexpr( std::is_same_v ) + { + memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); + memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); + memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); + } + else + { + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); + } + } +#endif + + //-------------------------------------------------------------------------- + // + // Implementations of transposition methods + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> + // p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) + // + +#ifdef MGONGPUCPP_GPUIMPL + template + __global__ void 
dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool oldImplementation = true; // default: use old implementation + if constexpr( oldImplementation ) + { + // SR initial implementation + constexpr int part = CPPProcess::npar; + constexpr int mome = CPPProcess::np4; + constexpr int strd = MemoryAccessMomenta::neppM; + int pos = blockDim.x * blockIdx.x + threadIdx.x; + int arrlen = nevt * part * mome; + if( pos < arrlen ) + { + int page_i = pos / ( strd * mome * part ); + int rest_1 = pos % ( strd * mome * part ); + int part_i = rest_1 / ( strd * mome ); + int rest_2 = rest_1 % ( strd * mome ); + int mome_i = rest_2 / strd; + int strd_i = rest_2 % strd; + int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) + } + } + else + { + // AV attempt another implementation with 1 event per thread: this seems + // slower... F-style: AOS[nevtF][nparF][np4F] C-style: + // AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + constexpr int npar = CPPProcess::npar; + constexpr int np4 = CPPProcess::np4; + constexpr int neppM = MemoryAccessMomenta::neppM; + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? 
+ int ievt = blockDim.x * blockIdx.x + threadIdx.x; + int ipagM = ievt / neppM; + int ieppM = ievt % neppM; + for( int ip4 = 0; ip4 < np4; ip4++ ) + for( int ipar = 0; ipar < npar; ipar++ ) + { + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + + ip4 * neppM + ieppM; + int fpos = ievt * npar * np4 + ipar * np4 + ip4; + out[cpos] = in[fpos]; // F2C (Fortran to C) + } + } + } +#endif + + template + void hst_transposeMomenta( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool oldImplementation = false; // default: use new implementation + if constexpr( oldImplementation ) + { + // SR initial implementation + constexpr unsigned int part = CPPProcess::npar; + constexpr unsigned int mome = CPPProcess::np4; + constexpr unsigned int strd = MemoryAccessMomenta::neppM; + unsigned int arrlen = nevt * part * mome; + for( unsigned int pos = 0; pos < arrlen; ++pos ) + { + unsigned int page_i = pos / ( strd * mome * part ); + unsigned int rest_1 = pos % ( strd * mome * part ); + unsigned int part_i = rest_1 / ( strd * mome ); + unsigned int rest_2 = rest_1 % ( strd * mome ); + unsigned int mome_i = rest_2 / strd; + unsigned int strd_i = rest_2 % strd; + unsigned int inpos = ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) + else + out[inpos] = in[pos]; // C2F (C to Fortran) + } + } + else + { + // AV attempt another implementation: this is slightly faster (better c++ + // pipelining?) [NB! 
this is not a transposition, it is an AOS to AOSOA + // conversion: if neppM=1, a memcpy is enough] F-style: + // AOS[nevtF][nparF][np4F] C-style: AOSOA[npagM][npar][np4][neppM] with + // nevt=npagM*neppM + constexpr unsigned int npar = CPPProcess::npar; + constexpr unsigned int np4 = CPPProcess::np4; + constexpr unsigned int neppM = MemoryAccessMomenta::neppM; + if constexpr( neppM == 1 && std::is_same_v ) + { + memcpy( out, in, nevt * npar * np4 * sizeof( Tin ) ); + } + else + { + const unsigned int npagM = nevt / neppM; + assert( nevt % neppM == + 0 ); // number of events is not a multiple of neppM??? + for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) + for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) + for( unsigned int ipar = 0; ipar < npar; ipar++ ) + for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) + { + unsigned int ievt = ipagM * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; + if constexpr( F2C ) + out[cpos] = in[fpos]; // F2C (Fortran to C) + else + out[fpos] = in[cpos]; // C2F (C to Fortran) + } + } + } + } + + template + void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool F2C = true; + hst_transposeMomenta( in, out, nevt ); + } + + template + void hst_transposeMomentaC2F( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool F2C = false; + hst_transposeMomenta( in, out, nevt ); + } + + //-------------------------------------------------------------------------- +} // namespace mg5amcGpu +#endif // BRIDGE_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc new file mode 100644 index 0000000000..cb5d56b3cc --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -0,0 +1,163 @@ +// Copyright (C) 2020-2024 
CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: D. Massaro, J. Teig, A. Thete, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#include "BridgeKernels.h" + +#include "GpuAbstraction.h" +#include "MemoryAccessMomenta.h" + +#include + +//============================================================================ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int np4 = CPPProcess::np4; // dimensions of 4-momenta (E,px,py,pz) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + + //-------------------------------------------------------------------------- + + BridgeKernelBase::BridgeKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt) + : MatrixElementKernelBase( momenta, gs, iflavorVec, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_bridge( nevt, npar, np4 ) + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "BridgeKernelBase: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "BridgeKernelBase: matrixElements must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw 
std::runtime_error( "BridgeKernelBase: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "BridgeKernelBase: nevt mismatch with matrixElements" ); + } + + //-------------------------------------------------------------------------- +} + +//============================================================================ + +#ifndef MGONGPUCPP_GPUIMPL +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + BridgeKernelHost::BridgeKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: Gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt) + : BridgeKernelBase( momenta, gs, iflavorVec, rndhel, rndcol, channelIds, matrixElements, selhel, selcol, nevt) + , m_fortranMomenta( nevt ) + { + } + + //-------------------------------------------------------------------------- + + void BridgeKernelHost::transposeInputMomentaC2F() + { + hst_transposeMomentaC2F( m_momenta.data(), m_fortranMomenta.data(), nevt() ); + } + + //-------------------------------------------------------------------------- + + int BridgeKernelHost::computeGoodHelicities() + { + constexpr bool goodHelOnly = true; + constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), 
m_selhel.data(), m_selcol.data(), goodHelOnly ); + return m_bridge.nGoodHel(); + } + + //-------------------------------------------------------------------------- + + void BridgeKernelHost::computeMatrixElements( const bool useChannelIds ) + { + constexpr bool goodHelOnly = false; + const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + + //-------------------------------------------------------------------------- + + BridgeKernelDevice::BridgeKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: Gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads) + : BridgeKernelBase( momenta, gs, iflavorVec, rndhel, rndcol, channelIds, matrixElements, selhel, selcol, gpublocks * gputhreads) + , m_fortranMomenta( nevt() ) + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if( m_gpublocks == 0 ) throw std::runtime_error( "BridgeKernelDevice: gpublocks must be > 0" ); + if( m_gputhreads == 0 ) throw std::runtime_error( 
"BridgeKernelDevice: gputhreads must be > 0" ); + m_bridge.set_gpugrid( gpublocks, gputhreads ); + } + + //-------------------------------------------------------------------------- + + void BridgeKernelDevice::transposeInputMomentaC2F() + { + hst_transposeMomentaC2F( m_momenta.data(), m_fortranMomenta.data(), nevt() ); + } + + //-------------------------------------------------------------------------- + + int BridgeKernelDevice::computeGoodHelicities() + { + constexpr bool goodHelOnly = true; + constexpr unsigned int* pChannelIds = nullptr; // disable multi-channel for helicity filtering + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + return m_bridge.nGoodHel(); + } + + //-------------------------------------------------------------------------- + + void BridgeKernelDevice::computeMatrixElements( const bool useChannelIds ) + { + constexpr bool goodHelOnly = false; + const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h new file mode 100644 index 0000000000..3d94970dd7 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: D. Massaro, J. Teig, A. Thete, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#ifndef BRIDGEKERNELS_H +#define BRIDGEKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "Bridge.h" +#include "MatrixElementKernels.h" +#include "MemoryBuffers.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // A Bridge wrapper base class encapsulating matrix element calculations on a CPU host + class BridgeKernelBase : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + BridgeKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt); + + // Destructor + virtual ~BridgeKernelBase() {} + + // Transpose input momenta from C to Fortran before the matrix element calculation in the Bridge + virtual void transposeInputMomentaC2F() = 0; + + protected: + + // The wrapped bridge + Bridge m_bridge; + }; + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + // A Bridge wrapper class encapsulating matrix element calculations on a CPU host + class BridgeKernelHost final : public BridgeKernelBase + { + public: + + // Constructor from existing input and output 
buffers + BridgeKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt); + + // Destructor + virtual ~BridgeKernelHost() {} + + // Transpose input momenta from C to Fortran before the matrix element calculation in the Bridge + void transposeInputMomentaC2F() override final; + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + private: + + // The buffer for the input momenta, transposed to Fortran array indexing + HostBufferMomenta m_fortranMomenta; + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + // A Bridge wrapper class encapsulating matrix element calculations on a GPU device + class BridgeKernelDevice : public BridgeKernelBase + { + public: + + // Constructor from existing input and output buffers + BridgeKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads); + + // Destructor + virtual ~BridgeKernelDevice() {} + + // Transpose input momenta from C to Fortran before the matrix element calculation in the Bridge + void transposeInputMomentaC2F() override final; + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return true; } + + private: + + // The buffer for the input momenta, transposed to Fortran array indexing + PinnedHostBufferMomenta m_fortranMomenta; + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + }; +#endif + + //-------------------------------------------------------------------------- +} +#endif // BRIDGEKERNELS_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc new file mode 100644 index 0000000000..89092fbc38 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc @@ -0,0 +1,38 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include <algorithm> + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + CommonRandomNumberKernel::CommonRandomNumberKernel( BufferRndNumMomenta& rnarray ) + : RandomNumberKernelBase( rnarray ) + , m_seed( 20211220 ) + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "CommonRandomNumberKernel on host with a device random number array" ); + } + + //-------------------------------------------------------------------------- + + void CommonRandomNumberKernel::generateRnarray() + { + std::vector<double> rnd = CommonRandomNumbers::generate<double>( m_rnarray.size(), m_seed ); // NB: generate as double (HARDCODED) + std::copy( rnd.begin(), rnd.end(), m_rnarray.data() ); // NB: copy may imply a double-to-float conversion + } + + //-------------------------------------------------------------------------- +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumbers.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumbers.h new file mode 100644 index 0000000000..0cbd979310 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumbers.h @@ -0,0 +1,96 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef COMMONRANDOMNUMBERS_H_ +#define COMMONRANDOMNUMBERS_H_ 1 + +#include <future> +#include <random> +#include <thread> +#include <vector> + +namespace CommonRandomNumbers +{ + + /// Create `n` random numbers using simple c++ engine.
+ template<typename T> + std::vector<T> generate( std::size_t n, std::minstd_rand::result_type seed = 1337 ) + { + std::vector<T> result; + result.reserve( n ); + + std::minstd_rand generator( seed ); + std::uniform_real_distribution<T> distribution( 0.0, 1.0 ); + + for( std::size_t i = 0; i < n; ++i ) + { + result.push_back( distribution( generator ) ); + } + + return result; + } + + /// Create `nBlock` blocks of random numbers. + /// Each block uses a generator that's seeded with `seed + blockIndex`, and blocks are generated in parallel. + template<typename T> + std::vector<std::vector<T>> generateParallel( std::size_t nPerBlock, std::size_t nBlock, std::minstd_rand::result_type seed = 1337 ) + { + std::vector<std::vector<T>> results( nBlock ); + std::vector<std::thread> threads; + const auto partPerThread = nBlock / std::thread::hardware_concurrency() + ( nBlock % std::thread::hardware_concurrency() != 0 ); + + auto makeBlock = [nPerBlock, nBlock, seed, &results]( std::size_t partitionBegin, std::size_t partitionEnd ) + { + for( std::size_t partition = partitionBegin; partition < partitionEnd && partition < nBlock; ++partition ) + { + results[partition] = generate<T>( nPerBlock, seed + partition ); + } + }; + + for( unsigned int threadId = 0; threadId < std::thread::hardware_concurrency(); ++threadId ) + { + threads.emplace_back( makeBlock, threadId * partPerThread, ( threadId + 1 ) * partPerThread ); + } + + for( auto& thread: threads ) + { + thread.join(); + } + + return results; + } + + /// Starts asynchronous generation of random numbers. This uses as many threads as cores, and generates blocks of random numbers. + /// These become available at unspecified times, but the blocks 0, 1, 2, ... are generated first. + /// Each block is seeded with seed + blockIndex to generate stable sequences. + /// \param[in/out] promises Vector of promise objects storing blocks of random numbers. + /// \param[in] nPerBlock Configures number of entries generated per block. + /// \param[in] nBlock Configures the number of blocks generated.
+ /// \param[in] nThread Optional concurrency. + /// \param[in] seed Optional seed. + template<typename T> + void startGenerateAsync( std::vector<std::promise<std::vector<T>>>& promises, std::size_t nPerBlock, std::size_t nBlock, unsigned int nThread = std::thread::hardware_concurrency(), std::minstd_rand::result_type seed = 1337 ) + { + promises.resize( nBlock ); + std::vector<std::thread> threads; + + auto makeBlocks = [=, &promises]( std::size_t threadID ) + { + for( std::size_t partition = threadID; partition < nBlock; partition += nThread ) + { + auto values = generate<T>( nPerBlock, seed + partition ); + promises[partition].set_value( std::move( values ) ); + } + }; + + for( unsigned int threadId = 0; threadId < nThread; ++threadId ) + { + std::thread( makeBlocks, threadId ).detach(); + } + } + +} + +#endif /* COMMONRANDOMNUMBERS_H_ */ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc new file mode 100644 index 0000000000..bb1e49e3a7 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc @@ -0,0 +1,237 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#include "CrossSectionKernels.h" + +#include "GpuAbstraction.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessWeights.h" +#include "MemoryBuffers.h" + +#include <iostream> + +// ****************************************************************************************** +// *** NB: Disabling fast math is essential here, otherwise results are undefined *** +// *** NB: This file CrossSectionKernels.cc IS BUILT WITH -fno-fast-math in the Makefile!
*** +// *** NB: Attempts with __attribute__((optimize("-fno-fast-math"))) were unsatisfactory *** +// ****************************************************************************************** + +inline bool +fp_is_nan( const fptype& fp ) +{ + //#pragma clang diagnostic push + //#pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) + return std::isnan( fp ); // always false for clang in fast math mode (tautological compare)? + //#pragma clang diagnostic pop +} + +inline bool +fp_is_abnormal( const fptype& fp ) +{ + if( fp_is_nan( fp ) ) return true; + if( fp != fp ) return true; + return false; +} + +inline bool +fp_is_zero( const fptype& fp ) +{ + if( fp == 0 ) return true; + return false; +} + +// See https://en.cppreference.com/w/cpp/numeric/math/FP_categories +inline const char* +fp_show_class( const fptype& fp ) +{ + switch( std::fpclassify( fp ) ) + { + case FP_INFINITE: return "Inf"; + case FP_NAN: return "NaN"; + case FP_NORMAL: return "normal"; + case FP_SUBNORMAL: return "subnormal"; + case FP_ZERO: return "zero"; + default: return "unknown"; + } +} + +inline void +debug_me_is_abnormal( const fptype& me, size_t ievtALL ) +{ + std::cout << "DEBUG[" << ievtALL << "]" + << " ME=" << me + << " fpisabnormal=" << fp_is_abnormal( me ) + << " fpclass=" << fp_show_class( me ) + << " (me==me)=" << ( me == me ) + << " (me==me+1)=" << ( me == me + 1 ) + << " isnan=" << fp_is_nan( me ) + << " isfinite=" << std::isfinite( me ) + << " isnormal=" << std::isnormal( me ) + << " is0=" << ( me == 0 ) + << " is1=" << ( me == 1 ) + << " abs(ME)=" << std::abs( me ) + << " isnan=" << fp_is_nan( std::abs( me ) ) + << std::endl; +} + +//============================================================================ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + void flagAbnormalMEs( fptype* 
hstMEs, unsigned int nevt ) + { + for( unsigned int ievt = 0; ievt < nevt; ievt++ ) + { + if( fp_is_abnormal( hstMEs[ievt] ) ) + { + std::cout << "WARNING! flagging abnormal ME for ievt=" << ievt << std::endl; + hstMEs[ievt] = std::sqrt( -1. ); + } + } + } + + //-------------------------------------------------------------------------- + + CrossSectionKernelHost::CrossSectionKernelHost( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t nevt ) + : CrossSectionKernelBase( samplingWeights, matrixElements, stats ) + , NumberOfEvents( nevt ) + { + if( m_samplingWeights.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelHost: samplingWeights must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelHost: matrixElements must be a host array" ); + if( this->nevt() != m_samplingWeights.nevt() ) throw std::runtime_error( "CrossSectionKernelHost: nevt mismatch with samplingWeights" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "CrossSectionKernelHost: nevt mismatch with matrixElements" ); + } + + //-------------------------------------------------------------------------- + + void CrossSectionKernelHost::updateEventStatistics( const bool debug ) + { + EventStatistics stats; // new statistics for the new nevt events + // FIRST PASS: COUNT ALL/ABN/ZERO EVENTS, COMPUTE MIN/MAX, COMPUTE REFS AS MEANS OF SIMPLE SUMS + for( size_t ievt = 0; ievt < nevt(); ++ievt ) // Loop over all events in this iteration + { + const fptype& me = MemoryAccessMatrixElements::ieventAccessConst( m_matrixElements.data(), ievt ); + const fptype& wg = MemoryAccessWeights::ieventAccessConst( m_samplingWeights.data(), ievt ); + const size_t ievtALL = m_iter * nevt() + ievt; + // The following events are abnormal in a run with "-p 2048 256 12 -d" + // - 
check.exe/commonrand: ME[310744,451171,3007871,3163868,4471038,5473927] with fast math + // - check.exe/curand: ME[578162,1725762,2163579,5407629,5435532,6014690] with fast math + // - gcheck.exe/curand: ME[596016,1446938] with fast math + // Debug NaN/abnormal issues + //if ( ievtALL == 310744 ) // this ME is abnormal both with and without fast math + // debug_me_is_abnormal( me, ievtALL ); + //if ( ievtALL == 5473927 ) // this ME is abnormal only with fast math + // debug_me_is_abnormal( me, ievtALL ); + stats.nevtALL++; + if( fp_is_abnormal( me ) ) + { + if( debug ) // only printed out with "-p -d" (matrixelementALL is not filled without -p) + std::cout << "WARNING! ME[" << ievtALL << "] is NaN/abnormal" << std::endl; + stats.nevtABN++; + continue; + } + if( fp_is_zero( me ) ) stats.nevtZERO++; + stats.minME = std::min( stats.minME, (double)me ); + stats.maxME = std::max( stats.maxME, (double)me ); + stats.minWG = std::min( stats.minWG, (double)wg ); + stats.maxWG = std::max( stats.maxWG, (double)wg ); + stats.sumMEdiff += me; // NB stats.refME is 0 here + stats.sumWGdiff += wg; // NB stats.refWG is 0 here + } + stats.refME = stats.meanME(); // draft ref + stats.refWG = stats.meanWG(); // draft ref + stats.sumMEdiff = 0; + stats.sumWGdiff = 0; + // SECOND PASS: IMPROVE MEANS FROM SUMS OF DIFFS TO PREVIOUS REF, UPDATE REF + for( size_t ievt = 0; ievt < nevt(); ++ievt ) // Loop over all events in this iteration + { + const fptype& me = MemoryAccessMatrixElements::ieventAccessConst( m_matrixElements.data(), ievt ); + const fptype& wg = MemoryAccessWeights::ieventAccessConst( m_samplingWeights.data(), ievt ); + if( fp_is_abnormal( me ) ) continue; + stats.sumMEdiff += ( me - stats.refME ); + stats.sumWGdiff += ( wg - stats.refWG ); + } + stats.refME = stats.meanME(); // final ref + stats.refWG = stats.meanWG(); // final ref + stats.sumMEdiff = 0; + stats.sumWGdiff = 0; + // THIRD PASS: COMPUTE STDDEV FROM SQUARED SUMS OF DIFFS TO REF + for( size_t ievt = 0; ievt < 
nevt(); ++ievt ) // Loop over all events in this iteration + { + const fptype& me = MemoryAccessMatrixElements::ieventAccessConst( m_matrixElements.data(), ievt ); + const fptype& wg = MemoryAccessWeights::ieventAccessConst( m_samplingWeights.data(), ievt ); + if( fp_is_abnormal( me ) ) continue; + stats.sqsMEdiff += std::pow( me - stats.refME, 2 ); + stats.sqsWGdiff += std::pow( wg - stats.refWG, 2 ); + } + // FOURTH PASS: UPDATE THE OVERALL STATS BY ADDING THE NEW STATS + m_stats += stats; + // Increment the iterations counter + m_iter++; + } + + //-------------------------------------------------------------------------- +} + +//============================================================================ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + + /* + //-------------------------------------------------------------------------- + + CrossSectionKernelDevice::CrossSectionKernelDevice( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t gpublocks, + const size_t gputhreads ) + : CrossSectionKernelBase( samplingWeights, matrixElements, stats ) + , NumberOfEvents( gpublocks*gputhreads ) + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if ( ! m_samplingWeights.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelDevice: samplingWeights must be a device array" ); + if ( ! 
m_matrixElements.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelDevice: matrixElements must be a device array" ); + if ( m_gpublocks == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gpublocks must be > 0" ); + if ( m_gputhreads == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gputhreads must be > 0" ); + if ( this->nevt() != m_samplingWeights.nevt() ) throw std::runtime_error( "CrossSectionKernelDevice: nevt mismatch with samplingWeights" ); + if ( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "CrossSectionKernelDevice: nevt mismatch with matrixElements" ); + } + + //-------------------------------------------------------------------------- + + void CrossSectionKernelDevice::setGrid( const size_t gpublocks, const size_t gputhreads ) + { + if ( m_gpublocks == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gpublocks must be > 0 in setGrid" ); + if ( m_gputhreads == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gputhreads must be > 0 in setGrid" ); + if ( this->nevt() != m_gpublocks * m_gputhreads ) throw std::runtime_error( "CrossSectionKernelDevice: nevt mismatch in setGrid" ); + } + + //-------------------------------------------------------------------------- + + void CrossSectionKernelDevice::updateEventStatistics( const bool debug ) + { + // Increment the iterations counter + m_iter++; + } + + //-------------------------------------------------------------------------- + */ + +} +#endif + +//============================================================================ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h new file mode 100644 index 0000000000..f3267643f4 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h @@ -0,0 +1,138 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. 
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#ifndef CROSSSECTIONKERNELS_H +#define CROSSSECTIONKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "EventStatistics.h" +#include "MemoryBuffers.h" + +//============================================================================ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // Helper function for Bridge.h: must be compiled without fast math + // Iterate through all output MEs and replace any NaN/abnormal ones by sqrt(-1) + void flagAbnormalMEs( fptype* hstMEs, unsigned int nevt ); + + //-------------------------------------------------------------------------- + + // A base class encapsulating the calculation of event statistics on a CPU host or on a GPU device + class CrossSectionKernelBase //: virtual public ICrossSectionKernel + { + protected: + + // Constructor from existing input and output buffers + CrossSectionKernelBase( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats ) // output: event statistics + : m_samplingWeights( samplingWeights ) + , m_matrixElements( matrixElements ) + , m_stats( stats ) + , m_iter( 0 ) + { + // NB: do not initialise EventStatistics (you may be asked to update an existing result) + } + + public: + + // Destructor + virtual ~CrossSectionKernelBase() {} + + // Update event statistics + virtual void updateEventStatistics( const bool debug = false ) = 0; + + // Is this a host or device kernel? 
+ virtual bool isOnDevice() const = 0; + + protected: + + // The buffer for the sampling weights + const BufferWeights& m_samplingWeights; + + // The buffer for the output matrix elements + const BufferMatrixElements& m_matrixElements; + + // The event statistics + EventStatistics& m_stats; + + // The number of iterations processed so far + size_t m_iter; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating the calculation of event statistics on a CPU host + class CrossSectionKernelHost final : public CrossSectionKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + CrossSectionKernelHost( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t nevt ); + + // Destructor + virtual ~CrossSectionKernelHost() {} + + // Update event statistics + void updateEventStatistics( const bool debug = false ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + }; + + //-------------------------------------------------------------------------- + + /* +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating the calculation of event statistics on a GPU device + class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + CrossSectionKernelDevice( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~CrossSectionKernelDevice(){} + + // Reset gpublocks and gputhreads + void setGrid( const size_t gpublocks, const size_t gputhreads ); + + // Update event statistics + void updateEventStatistics( const bool debug=false ) override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return true; } + + private: + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + + }; +#endif + */ + + //-------------------------------------------------------------------------- +} +#endif // CROSSSECTIONKERNELS_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc new file mode 100644 index 0000000000..da07aa3a17 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -0,0 +1,135 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include <cassert> + +#ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +#include "curand.h" +#define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } +inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != CURAND_STATUS_SUCCESS ) + { + printf( "CurandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == CURAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_CURAND + CurandRandomNumberKernel::CurandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "CurandRandomNumberKernel does not support CurandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "CurandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + CurandRandomNumberKernel::~CurandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( 
"seedGenerator: seed %d\n", seed ); + checkCurand( curandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::createGenerator() + { + // [NB Timings are for GenRnGen host|device (cpp|cuda) generation of 256*32*1 events with nproc=1: rn(0) is host=0.0012s] + const curandRngType_t type = CURAND_RNG_PSEUDO_MTGP32; // 0.00082s | 0.00064s (FOR FAST TESTS) + //const curandRngType_t type = CURAND_RNG_PSEUDO_XORWOW; // 0.049s | 0.0016s + //const curandRngType_t type = CURAND_RNG_PSEUDO_MRG32K3A; // 0.71s | 0.0012s (better but slower, especially in c++) + //const curandRngType_t type = CURAND_RNG_PSEUDO_MT19937; // 21s | 0.021s + //const curandRngType_t type = CURAND_RNG_PSEUDO_PHILOX4_32_10; // 0.024s | 0.00026s (used to segfault?) + if( m_isOnDevice ) + { + checkCurand( curandCreateGenerator( &m_rnGen, type ) ); + } + else + { + checkCurand( curandCreateGeneratorHost( &m_rnGen, type ) ); + } + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_LEGACY ) ); // fails with code=104 (see #429) + checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_BEST ) ); + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_DYNAMIC ) ); // fails with code=104 (see #429) + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_SEEDED ) ); // fails with code=104 (see #429) + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::destroyGenerator() + { + checkCurand( curandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkCurand( curandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkCurand( 
curandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h new file mode 100644 index 0000000000..0857275ae4 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#ifndef EventStatistics_H +#define EventStatistics_H 1 + +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" // for npar (meGeVexponent) + +#include <cmath> +#include <iostream> +#include <limits> +#include <sstream> +#include <string> + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // The EventStatistics struct is used to accumulate running aggregates of event statistics.
+ // This will eventually include the process cross section and the process maximum weight: + // one important case of EventStatistics will then be the "gridpack" result set, which is + // the output of the "integration" step and the input to "unweighted event generation" step. + // The current implementation only includes statistics for matrix elements (ME) and sampling weights (WG); + // in first approximation, the process cross section and maximum weight are just the mean ME and maximum ME, + // but eventually the sampling weights WG (e.g. from Rambo) must also be taken into account in the calculation. + // The implementation uses differences to reference values to improve numerical precision. + struct EventStatistics + { + public: + size_t nevtALL; // total number of events used + size_t nevtABN; // number of events used, where ME is abnormal (nevtABN <= nevtALL) + size_t nevtZERO; // number of not-abnormal events used, where ME is zero (nevtZERO <= nevtOK) + double minME; // minimum matrix element + double maxME; // maximum matrix element + double minWG; // minimum sampling weight + double maxWG; // maximum sampling weight + double refME; // "reference" matrix element (normally the current mean) + double refWG; // "reference" sampling weight (normally the current mean) + double sumMEdiff; // sum of diff to ref for matrix element + double sumWGdiff; // sum of diff to ref for sampling weight + double sqsMEdiff; // squared sum of diff to ref for matrix element + double sqsWGdiff; // squared sum of diff to ref for sampling weight + std::string tag; // a text tag for printouts + // Number of events used, where ME is not abnormal + size_t nevtOK() const { return nevtALL - nevtABN; } + // Mean matrix element + // [x = ref+d => mean(x) = sum(x)/n = ref+sum(d)/n] + double meanME() const + { + return refME + ( nevtOK() > 0 ? 
sumMEdiff / nevtOK() : 0 ); + } + // Mean sampling weight + // [x = ref+d => mean(x) = sum(x)/n = ref+sum(d)/n] + double meanWG() const + { + return refWG + ( nevtOK() > 0 ? sumWGdiff / nevtOK() : 0 ); + } + // Variance matrix element + // [x = ref+d => n*var(x) = sum((x-mean(x))^2) = sum((ref+d-ref-sum(d)/n)^2) = sum((d-sum(d)/n)^2)/n = sum(d^2)-(sum(d))^2/n] + double varME() const { return ( sqsMEdiff - std::pow( sumMEdiff, 2 ) / nevtOK() ) / nevtOK(); } + // Variance sampling weight + // [x = ref+d => n*var(x) = sum((x-mean(x))^2) = sum((ref+d-ref-sum(d)/n)^2) = sum((d-sum(d)/n)^2)/n = sum(d^2)-(sum(d))^2/n] + double varWG() const { return ( sqsWGdiff - std::pow( sumWGdiff, 2 ) / nevtOK() ) / nevtOK(); } + // Standard deviation matrix element + double stdME() const { return std::sqrt( varME() ); } + // Standard deviation sampling weight + double stdWG() const { return std::sqrt( varWG() ); } + // Update reference matrix element + void updateRefME( const double newRef ) + { + const double deltaRef = refME - newRef; + sqsMEdiff += deltaRef * ( 2 * sumMEdiff + nevtOK() * deltaRef ); + sumMEdiff += deltaRef * nevtOK(); + refME = newRef; + } + // Update reference sampling weight + void updateRefWG( const double newRef ) + { + const double deltaRef = refWG - newRef; + sqsWGdiff += deltaRef * ( 2 * sumWGdiff + nevtOK() * deltaRef ); + sumWGdiff += deltaRef * nevtOK(); + refWG = newRef; + } + // Constructor + EventStatistics() + : nevtALL( 0 ) + , nevtABN( 0 ) + , nevtZERO( 0 ) + , minME( std::numeric_limits::max() ) + , maxME( std::numeric_limits::lowest() ) + , minWG( std::numeric_limits::max() ) + , maxWG( std::numeric_limits::lowest() ) + , refME( 0 ) + , refWG( 0 ) + , sumMEdiff( 0 ) + , sumWGdiff( 0 ) + , sqsMEdiff( 0 ) + , sqsWGdiff( 0 ) + , tag( "" ) {} + // Combine two EventStatistics +#ifdef __clang__ + // Disable optimizations for this function in HIP (work around FPE crash #1003: originally using #if __HIP_CLANG_ONLY__) + // Disable optimizations for this 
function in clang tout court (work around FPE crash #1005: now using #ifdef __clang__) + // See https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-selectively-disabling-optimization + __attribute__( ( optnone ) ) +#endif + EventStatistics& + operator+=( const EventStatistics& stats ) + { + EventStatistics s1 = *this; // temporary copy + EventStatistics s2 = stats; // temporary copy + EventStatistics& sum = *this; + sum.nevtALL = s1.nevtALL + s2.nevtALL; + sum.nevtABN = s1.nevtABN + s2.nevtABN; + sum.nevtZERO = s1.nevtZERO + s2.nevtZERO; + sum.minME = std::min( s1.minME, s2.minME ); + sum.maxME = std::max( s1.maxME, s2.maxME ); + sum.minWG = std::min( s1.minWG, s2.minWG ); + sum.maxWG = std::max( s1.maxWG, s2.maxWG ); + sum.refME = ( s1.meanME() * s1.nevtOK() + s2.meanME() * s2.nevtOK() ) / sum.nevtOK(); // new mean ME + s1.updateRefME( sum.refME ); + s2.updateRefME( sum.refME ); + sum.sumMEdiff = s1.sumMEdiff + s2.sumMEdiff; + sum.sqsMEdiff = s1.sqsMEdiff + s2.sqsMEdiff; + sum.refWG = ( s1.meanWG() * s1.nevtOK() + s2.meanWG() * s2.nevtOK() ) / sum.nevtOK(); // new mean WG + s1.updateRefWG( sum.refWG ); + s2.updateRefWG( sum.refWG ); + sum.sumWGdiff = s1.sumWGdiff + s2.sumWGdiff; + sum.sqsWGdiff = s1.sqsWGdiff + s2.sqsWGdiff; + return sum; + } + // Printout + void printout( std::ostream& out ) const + { + const EventStatistics& s = *this; + constexpr int meGeVexponent = -( 2 * CPPProcess::npar - 8 ); + out << s.tag << "NumMatrixElems(notAbnormal) = " << s.nevtOK() << std::endl + << std::scientific // fixed format: affects all floats (default precision: 6) + << s.tag << "MeanMatrixElemValue = ( " << s.meanME() + << " +- " << s.stdME() / std::sqrt( s.nevtOK() ) << " ) GeV^" << meGeVexponent << std::endl // standard error + << s.tag << "[Min,Max]MatrixElemValue = [ " << s.minME + << " , " << s.maxME << " ] GeV^" << meGeVexponent << std::endl + << s.tag << "StdDevMatrixElemValue = ( " << s.stdME() + << std::string( 16, ' ' ) << " ) GeV^" << meGeVexponent 
<< std::endl + << s.tag << "MeanWeight = ( " << s.meanWG() + << " +- " << s.stdWG() / std::sqrt( s.nevtOK() ) << std::endl // standard error + << s.tag << "[Min,Max]Weight = [ " << s.minWG + << " , " << s.maxWG << " ]" << std::endl + << s.tag << "StdDevWeight = ( " << s.stdWG() + << std::string( 16, ' ' ) << " )" << std::endl + << std::defaultfloat; // default format: affects all floats + } + }; + + //-------------------------------------------------------------------------- + + inline std::ostream& operator<<( std::ostream& out, const EventStatistics& s ) + { + s.printout( out ); + return out; + } + + //-------------------------------------------------------------------------- +} + +#endif // EventStatistics_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h new file mode 100644 index 0000000000..026253f354 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -0,0 +1,165 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include "mgOnGpuConfig.h" + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, 
bytes ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define 
gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif + +//-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h new file mode 100644 index 0000000000..086aa6a616 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h @@ -0,0 +1,101 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! 
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = false ) // ZW: changed debug default to false + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = false ) // ZW: changed debug default to false + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/HiprandRandomNumberKernel.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
+ +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + 
//printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( 
m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h new file mode 100644 index 0000000000..a278f8849b --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -0,0 +1,340 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MADGRAPHTEST_H_ +#define MADGRAPHTEST_H_ 1 + +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" + +#include + +#include +#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif +#include +#include +#include +#include +#include +#include + +#ifdef MGONGPUCPP_GPUIMPL +using mg5amcGpu::CPPProcess; +#else +using mg5amcCpu::CPPProcess; +#endif + +namespace +{ + struct ReferenceData + { + std::vector>> momenta; + std::vector MEs; + std::vector ChanIds; + std::vector SelHels; + std::vector SelCols; + }; + + /// Read batches of reference data from a file and store them in a map. + std::map readReferenceData( const std::string& refFileName ) + { + std::cout << "INFO: Opening reference file " << refFileName << std::endl; + std::ifstream referenceFile( refFileName.c_str() ); + EXPECT_TRUE( referenceFile.is_open() ) << refFileName; + std::map referenceData; + unsigned int evtNo; + unsigned int batchNo; + for( std::string line; std::getline( referenceFile, line ); ) + { + std::stringstream lineStr( line ); + if( line.empty() || line[0] == '#' ) + { + continue; + } + else if( line.find( "Event" ) != std::string::npos ) + { + std::string dummy; + lineStr >> dummy >> evtNo >> dummy >> batchNo; + } + else if( line.find( "ME" ) != std::string::npos ) + { + if( evtNo <= referenceData[batchNo].MEs.size() ) + referenceData[batchNo].MEs.resize( evtNo + 1 ); + std::string dummy; + lineStr >> dummy >> referenceData[batchNo].MEs[evtNo]; + } + else if( line.find( "ChanId" ) != std::string::npos ) + { + if( evtNo <= referenceData[batchNo].ChanIds.size() ) + referenceData[batchNo].ChanIds.resize( evtNo + 1 ); + std::string dummy; + lineStr >> dummy >> referenceData[batchNo].ChanIds[evtNo]; +#ifndef MGONGPU_SUPPORTS_MULTICHANNEL + referenceData[batchNo].ChanIds[evtNo] = 0; // disable ChanId comparison if multichannel is not supported #976 
+#endif + } + else if( line.find( "SelHel" ) != std::string::npos ) + { + if( evtNo <= referenceData[batchNo].SelHels.size() ) + referenceData[batchNo].SelHels.resize( evtNo + 1 ); + std::string dummy; + lineStr >> dummy >> referenceData[batchNo].SelHels[evtNo]; + } + else if( line.find( "SelCol" ) != std::string::npos ) + { + if( evtNo <= referenceData[batchNo].SelCols.size() ) + referenceData[batchNo].SelCols.resize( evtNo + 1 ); + std::string dummy; + lineStr >> dummy >> referenceData[batchNo].SelCols[evtNo]; + } + else + { + unsigned int particleIndex; + lineStr >> particleIndex; + if( evtNo <= referenceData[batchNo].momenta.size() ) + referenceData[batchNo].momenta.resize( evtNo + 1 ); + if( particleIndex <= referenceData[batchNo].momenta[evtNo].size() ) + referenceData[batchNo].momenta[evtNo].resize( particleIndex + 1 ); + auto& fourVec = referenceData[batchNo].momenta[evtNo][particleIndex]; + for( unsigned int i = 0; i < fourVec.size(); ++i ) + { + EXPECT_TRUE( lineStr.good() ); + lineStr >> fourVec[i]; + } + EXPECT_TRUE( lineStr.eof() ); + } + } + return referenceData; + } + +} + +/** + * Test driver providing a common interface for testing different implementations. + * Users need to implement: + * - Functions to retrieve matrix element and 4-momenta. These are used in the tests. + * - Driver functions that run the madgraph workflow. 
+ */ +class TestDriverBase +{ + std::string m_refFileName; +public: + const unsigned int nparticle; + static constexpr unsigned int niter = 2; + static constexpr unsigned int gpublocks = 2; + static constexpr unsigned int gputhreads = 128; + static constexpr unsigned int nevt = gpublocks * gputhreads; + + TestDriverBase( unsigned int npart, const std::string& refFileName ) + : m_refFileName( refFileName ) + , nparticle( npart ) + { + } + TestDriverBase() = delete; + virtual ~TestDriverBase() {} + const std::string& getRefFileName() { return m_refFileName; } + + // ------------------------------------------------ + // Interface for retrieving info from madgraph + // ------------------------------------------------ + virtual fptype getMomentum( std::size_t evtNo, unsigned int particleNo, unsigned int component ) const = 0; + virtual fptype getMatrixElement( std::size_t evtNo ) const = 0; + virtual int getChannelId( std::size_t ievt ) const = 0; + virtual int getSelectedHelicity( std::size_t ievt ) const = 0; + virtual int getSelectedColor( std::size_t ievt ) const = 0; + + // ------------------------------------------------ + // Interface for steering madgraph run + // ------------------------------------------------ + virtual void prepareRandomNumbers( unsigned int iiter ) = 0; + virtual void prepareMomenta( fptype energy ) = 0; + virtual void runSigmaKin( std::size_t iiter ) = 0; + + /// Print the requested event into the stream. If the reference data has enough events, it will be printed as well. 
+ void dumpParticles( std::ostream& stream, std::size_t ievt, unsigned int numParticles, unsigned int nDigit, const ReferenceData& referenceData ) const + { + const auto width = nDigit + 8; + for( unsigned int ipar = 0; ipar < numParticles; ipar++ ) + { + // NB: 'setw' affects only the next field (of any type) + stream << std::scientific // fixed format: affects all floats (default nDigit: 6) + << std::setprecision( nDigit ) + << std::setw( 4 ) << ipar + << std::setw( width ) << getMomentum( ievt, ipar, 0 ) + << std::setw( width ) << getMomentum( ievt, ipar, 1 ) + << std::setw( width ) << getMomentum( ievt, ipar, 2 ) + << std::setw( width ) << getMomentum( ievt, ipar, 3 ) + << "\n"; + if( ievt < referenceData.momenta.size() ) + { + stream << "ref" << ipar; + stream << std::setw( width ) << referenceData.momenta[ievt][ipar][0] + << std::setw( width ) << referenceData.momenta[ievt][ipar][1] + << std::setw( width ) << referenceData.momenta[ievt][ipar][2] + << std::setw( width ) << referenceData.momenta[ievt][ipar][3] + << "\n\n"; + } + stream << std::flush << std::defaultfloat; // default format: affects all floats + } + } +}; + +/** + * Test class that's defining all tests to run with a Madgraph workflow. + */ +class MadgraphTest +{ +public: + MadgraphTest( TestDriverBase& testDriverRef ) + : testDriver( &testDriverRef ) {} + ~MadgraphTest() {} + void CompareMomentaAndME( testing::Test& googleTest ) const; // NB: googleTest is ONLY needed for the HasFailure method... +private: + TestDriverBase* testDriver; // non-owning pointer +}; + +void +MadgraphTest::CompareMomentaAndME( testing::Test& googleTest ) const +{ + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 +#ifdef __APPLE__ + const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 +#else + //const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; // fails smeft/hip #843 + const fptype toleranceMEs = std::is_same::value ? 
1.E-6 : 3.E-3; +#endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 + std::ofstream dumpFile; + if( dumpEvents ) + { + dumpFile.open( dumpFileName, std::ios::trunc ); + } + // Read reference data + std::map referenceData; + if( !dumpEvents ) + { + referenceData = readReferenceData( refFileName ); + } + ASSERT_FALSE( googleTest.HasFailure() ); // It doesn't make any sense to continue if we couldn't read the reference file. 
+ // ************************************** + // *** START MAIN LOOP ON #ITERATIONS *** + // ************************************** + for( unsigned int iiter = 0; iiter < testDriver->niter; ++iiter ) + { + testDriver->prepareRandomNumbers( iiter ); + testDriver->prepareMomenta( energy ); + testDriver->runSigmaKin( iiter ); + // --- Run checks on all events produced in this iteration + for( std::size_t ievt = 0; ievt < testDriver->nevt && !googleTest.HasFailure(); ++ievt ) + { + if( dumpEvents ) + { + ASSERT_TRUE( dumpFile.is_open() ) << dumpFileName; + dumpFile << "Event " << std::setw( 8 ) << ievt << " " + << "Batch " << std::setw( 4 ) << iiter << "\n"; + testDriver->dumpParticles( dumpFile, ievt, testDriver->nparticle, 15, ReferenceData() ); + // Dump matrix element + dumpFile << std::setw( 4 ) << "ME" << std::scientific << std::setw( 15 + 8 ) + << testDriver->getMatrixElement( ievt ) << "\n" + << std::defaultfloat; + // Dump channelId + dumpFile << "ChanId" << std::setw( 8 ) << testDriver->getChannelId( ievt ) << "\n"; + // Dump selected helicity and color + dumpFile << "SelHel" << std::setw( 8 ) << testDriver->getSelectedHelicity( ievt ) << "\n"; + dumpFile << "SelCol" << std::setw( 8 ) << testDriver->getSelectedColor( ievt ) << "\n" + << std::endl; // leave one line between events + continue; + } + // Check that we have the required reference data + ASSERT_GT( referenceData.size(), iiter ) + << "Don't have enough reference data for iteration " << iiter << ". 
Ref file:" << refFileName; + ASSERT_GT( referenceData[iiter].MEs.size(), ievt ) + << "Don't have enough reference MEs for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + ASSERT_GT( referenceData[iiter].ChanIds.size(), ievt ) + << "Don't have enough reference ChanIds for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + ASSERT_GT( referenceData[iiter].SelHels.size(), ievt ) + << "Don't have enough reference SelHels for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + ASSERT_GT( referenceData[iiter].SelCols.size(), ievt ) + << "Don't have enough reference SelCols for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + ASSERT_GT( referenceData[iiter].momenta.size(), ievt ) + << "Don't have enough reference momenta for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + ASSERT_GE( referenceData[iiter].momenta[ievt].size(), testDriver->nparticle ) + << "Don't have enough reference particles for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + // This trace will help to understand the event that is being checked. 
+ // It will only be printed in case of failures: + std::stringstream eventTrace; + eventTrace << "In comparing event " << ievt << " from iteration " << iiter << "\n"; + testDriver->dumpParticles( eventTrace, ievt, testDriver->nparticle, 15, referenceData[iiter] ); + eventTrace << std::setw( 4 ) << "ME" << std::scientific << std::setw( 15 + 8 ) + << testDriver->getMatrixElement( ievt ) << "\n" + << std::setw( 4 ) << "r.ME" << std::scientific << std::setw( 15 + 8 ) + << referenceData[iiter].MEs[ievt] << std::endl + << std::defaultfloat; + eventTrace << std::setw( 8 ) << "ChanId" << std::setw( 8 ) << testDriver->getChannelId( ievt ) << "\n" + << std::setw( 8 ) << "r.ChanId" << std::setw( 8 ) << referenceData[iiter].ChanIds[ievt] << std::endl; + eventTrace << std::setw( 8 ) << "SelHel" << std::setw( 8 ) << testDriver->getSelectedHelicity( ievt ) << "\n" + << std::setw( 8 ) << "r.SelHel" << std::setw( 8 ) << referenceData[iiter].SelHels[ievt] << std::endl; + eventTrace << std::setw( 8 ) << "SelCol" << std::setw( 8 ) << testDriver->getSelectedColor( ievt ) << "\n" + << std::setw( 8 ) << "r.SelCol" << std::setw( 8 ) << referenceData[iiter].SelCols[ievt] << std::endl; + SCOPED_TRACE( eventTrace.str() ); + // Compare Momenta + for( unsigned int ipar = 0; ipar < testDriver->nparticle; ++ipar ) + { + std::stringstream momentumErrors; + for( unsigned int icomp = 0; icomp < CPPProcess::np4; ++icomp ) + { + const fptype pMadg = testDriver->getMomentum( ievt, ipar, icomp ); + const fptype pOrig = referenceData[iiter].momenta[ievt][ipar][icomp]; + //const fptype relDelta = fabs( ( pMadg - pOrig ) / pOrig ); // computing relDelta may lead to FPEs + const fptype delta = fabs( pMadg - pOrig ); + if( delta > toleranceMomenta * fabs( pOrig ) ) // better than "relDelta > toleranceMomenta" + { + momentumErrors << std::setprecision( 15 ) << std::scientific << "\nparticle " << ipar << "\tcomponent " << icomp + << "\n\t madGraph: " << std::setw( 22 ) << pMadg + << "\n\t reference: " << 
std::setw( 22 ) << pOrig + << "\n\t relative delta exceeds tolerance of " << toleranceMomenta; + } + } + ASSERT_TRUE( momentumErrors.str().empty() ) << momentumErrors.str(); + } + // Compare ME: + EXPECT_NEAR( testDriver->getMatrixElement( ievt ), + referenceData[iiter].MEs[ievt], + toleranceMEs * referenceData[iiter].MEs[ievt] ); + // Compare channelId + EXPECT_EQ( testDriver->getChannelId( ievt ), + referenceData[iiter].ChanIds[ievt] ); + // Compare selected helicity and color + EXPECT_EQ( testDriver->getSelectedHelicity( ievt ), + referenceData[iiter].SelHels[ievt] ); + EXPECT_EQ( testDriver->getSelectedColor( ievt ), + referenceData[iiter].SelCols[ievt] ); + } + } + if( dumpEvents ) + { + std::cout << "Event dump written to " << dumpFileName << std::endl; + } +} + +#endif /* MADGRAPHTEST_H_ */ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc new file mode 100644 index 0000000000..c4ba05cb42 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -0,0 +1,537 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: D. Massaro, J. Teig, A. Thete, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
+ +#include "MatrixElementKernels.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include // for fetestexcept +#include +#include + +//============================================================================ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + MatrixElementKernelBase::MatrixElementKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol ) // output: color selection + : m_momenta( momenta ) + , m_gs( gs ) + , m_iflavorVec( iflavorVec ) + , m_rndhel( rndhel ) + , m_rndcol( rndcol ) + , m_channelIds( channelIds ) + , m_matrixElements( matrixElements ) + , m_selhel( selhel ) + , m_selcol( selcol ) +#ifdef MGONGPU_CHANNELID_DEBUG + , m_nevtProcessedByChannel() + , m_tag() +#endif + { + //std::cout << "DEBUG: MatrixElementKernelBase ctor " << this << std::endl; +#ifdef MGONGPU_CHANNELID_DEBUG + for( size_t channelId = 0; channelId < CPPProcess::ndiagrams + 1; channelId++ ) // [0...ndiagrams] (TEMPORARY: 0=multichannel) + m_nevtProcessedByChannel[channelId] = 0; +#endif + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelBase::~MatrixElementKernelBase() + { + //std::cout << "DEBUG: MatrixElementKernelBase 
dtor " << this << std::endl; +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::dumpNevtProcessedByChannel(); +#endif +#ifdef MGONGPUCPP_VERBOSE + MatrixElementKernelBase::dumpSignallingFPEs(); +#endif + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_CHANNELID_DEBUG + void MatrixElementKernelBase::updateNevtProcessedByChannel( const unsigned int* pHstChannelIds, const size_t nevt ) + { + if( pHstChannelIds != nullptr ) + { + //std::cout << "DEBUG " << this << ": not nullptr " << nevt << std::endl; + for( unsigned int ievt = 0; ievt < nevt; ievt++ ) + { + const size_t channelId = pHstChannelIds[ievt]; // Fortran indexing + //assert( channelId > 0 ); + //assert( channelId < CPPProcess::ndiagrams ); + m_nevtProcessedByChannel[channelId]++; + } + } + else + { + //std::cout << "DEBUG " << this << ": nullptr " << std::endl; + m_nevtProcessedByChannel[0] += nevt; + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_CHANNELID_DEBUG + void MatrixElementKernelBase::dumpNevtProcessedByChannel() + { + size_t nevtProcessed = 0; + for( size_t channelId = 0; channelId < CPPProcess::ndiagrams + 1; channelId++ ) // [0...ndiagrams] (TEMPORARY: 0=multichannel) + nevtProcessed += m_nevtProcessedByChannel[channelId]; + std::ostringstream sstr; + sstr << " {"; + for( size_t channelId = 0; channelId < CPPProcess::ndiagrams + 1; channelId++ ) // [0...ndiagrams] (TEMPORARY: 0=multichannel) + { + if( m_nevtProcessedByChannel[channelId] > 0 ) + { + if( sstr.str() != " {" ) sstr << ","; + if( channelId == 0 ) + sstr << " no-multichannel"; + else + sstr << " " << channelId; + sstr << " : " << m_nevtProcessedByChannel[channelId]; + } + } + sstr << " }"; + std::cout << "DEBUG: MEK " << this; + if( m_tag != "" ) std::cout << " " << m_tag; + std::cout << " processed " << nevtProcessed << " events across " << CPPProcess::ndiagrams << " channels" << sstr.str() << std::endl; + } 
+#endif + + //-------------------------------------------------------------------------- + + void MatrixElementKernelBase::dumpSignallingFPEs() + { + // New strategy for issue #831: add a final report of FPEs + // Note: normally only underflow will be reported here (inexact is switched off because it would almost always signal; + // divbyzero, invalid and overflow are configured by feenablexcept to send a SIGFPE signal, and are normally fixed in the code) + // Note: this is now called in the individual destructors of MEK classes rather than in that of MatrixElementKernelBase(#837) + std::string fpes; + if( std::fetestexcept( FE_DIVBYZERO ) ) fpes += " FE_DIVBYZERO"; + if( std::fetestexcept( FE_INVALID ) ) fpes += " FE_INVALID"; + if( std::fetestexcept( FE_OVERFLOW ) ) fpes += " FE_OVERFLOW"; + if( std::fetestexcept( FE_UNDERFLOW ) ) fpes += " FE_UNDERFLOW"; + //if( std::fetestexcept( FE_INEXACT ) ) fpes += " FE_INEXACT"; // do not print this out: this would almost always signal! + if( fpes == "" ) + std::cout << "INFO: No Floating Point Exceptions have been reported" << std::endl; + else + std::cerr << "INFO: The following Floating Point Exceptions have been reported:" << fpes << std::endl; + } + + //-------------------------------------------------------------------------- +} + +//============================================================================ + +#ifndef MGONGPUCPP_GPUIMPL +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + MatrixElementKernelHost::MatrixElementKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram 
enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, iflavorVec, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt * CPPProcess::ndiagrams ) + , m_denominators( nevt ) +#endif + { + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHost: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHost: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHost: nevt mismatch with channelIds" ); + if( this->nevt() != m_iflavorVec.nevt() ) throw std::runtime_error( "MatrixElementKernelHost: nevt mismatch with iflavorVec" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHost: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation + // 
Note: this prevents a crash on pmpe04 but not on some github CI nodes? + // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] + if( !MatrixElementKernelHost::hostSupportsSIMD() ) + throw std::runtime_error( "Host does not support the SIMD implementation of MatrixElementKernelsHost" ); + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHost::~MatrixElementKernelHost() + { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; + } + + //-------------------------------------------------------------------------- + + int MatrixElementKernelHost::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. Compute good helicity mask on the host + computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ); +#else + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); +#endif + // ... 0d2. Copy good helicity list to static memory on the host + // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] + return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHost::computeMatrixElements( const bool useChannelIds ) + { + computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); +#else + assert( useChannelIds == false ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + + //-------------------------------------------------------------------------- + + // Does this host system support the SIMD used in the matrix element calculation? + bool MatrixElementKernelHost::hostSupportsSIMD( const bool verbose ) + { +#if defined __AVX512VL__ + bool known = true; + bool ok = __builtin_cpu_supports( "avx512vl" ); + const std::string tag = "skylake-avx512 (AVX512VL)"; +#elif defined __AVX2__ + bool known = true; + bool ok = __builtin_cpu_supports( "avx2" ); + const std::string tag = "haswell (AVX2)"; +#elif defined __SSE4_2__ +#ifdef __PPC__ + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + bool known = true; + bool ok = __builtin_cpu_supports( "vsx" ); + const std::string tag = "powerpc vsx (128bit as in SSE4.2)"; +#elif defined __ARM_NEON__ // consider using __BUILTIN_CPU_SUPPORTS__ + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See 
https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; +#elif defined( __x86_64__ ) || defined( __i386__ ) + bool known = true; + bool ok = __builtin_cpu_supports( "sse4.2" ); + const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; +#endif +#else + bool known = true; + bool ok = true; + const std::string tag = "none"; +#endif + if( verbose ) + { + if( tag == "none" ) + std::cout << "INFO: The application does not require the host to support any AVX feature" << std::endl; + else if( ok && known ) + std::cout << "INFO: The application is built for " << tag << " and the host supports it" << std::endl; + else if( ok ) + std::cout << "WARNING: The application is built for " << tag << " but it is unknown if the host supports it" << std::endl; + else + std::cout << "ERROR! 
The application is built for " << tag << " but the host does not support it" << std::endl; + } + return ok; + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + + //-------------------------------------------------------------------------- + + MatrixElementKernelDevice::MatrixElementKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads) + : MatrixElementKernelBase( momenta, gs, iflavorVec, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( gpublocks * gputhreads ) + , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + , m_hstChannelIds( this->nevt() ) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_blasHandle() +#endif + , m_helStreams() + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; + if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: 
momenta must be a device array" ); + if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); + if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! + if( !m_iflavorVec.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: iflavorVec must be a device array" ); + if( m_gpublocks == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gpublocks must be > 0" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gputhreads must be > 0" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch with channelIds" ); + if( this->nevt() != m_iflavorVec.nevt() ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch with iflavorVec" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( m_gputhreads % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); + 
m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! 
Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelDevice::~MatrixElementKernelDevice() + { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; +#ifndef MGONGPU_HAS_NO_BLAS + if( m_blasHandle ) gpuBlasDestroy( m_blasHandle ); +#endif + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } + } + + //-------------------------------------------------------------------------- + + // FIXME! The relevance of this function should be reassessed (#543 and #902) + void MatrixElementKernelDevice::setGrid( const int /*gpublocks*/, const int /*gputhreads*/ ) + { + if( m_gpublocks == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gpublocks must be > 0 in setGrid" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gputhreads must be > 0 in setGrid" ); + if( this->nevt() != m_gpublocks * m_gputhreads ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch in setGrid" ); + } + + //-------------------------------------------------------------------------- + + int MatrixElementKernelDevice::computeGoodHelicities() + { + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); +#else + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); +#endif + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity (attached to the default stream) + if( m_blasColorSum ) + { + checkGpuBlas( gpuBlasCreate( &m_blasHandle ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_blasHandle, CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) + { + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? 
&m_blasHandle : nullptr ); +#else + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* pBlasHandle = nullptr; +#endif +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); +#else + assert( useChannelIds == false ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; + copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?! + const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); + MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); +#endif + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... 
+ } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h new file mode 100644 index 0000000000..693eeff489 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -0,0 +1,253 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: D. Massaro, J. Teig, A. Thete, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. + +#ifndef MATRIXELEMENTKERNELS_H +#define MATRIXELEMENTKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" +#include "MemoryBuffers.h" + +#include +#include + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // A base class encapsulating matrix element calculations on a CPU host or on a GPU device + class MatrixElementKernelBase //: virtual public IMatrixElementKernel + { + protected: + + // Constructor from existing input and output buffers + MatrixElementKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + 
BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol); // output: color selection + + public: + + // Destructor + virtual ~MatrixElementKernelBase(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + virtual int computeGoodHelicities() = 0; + + // Compute matrix elements + virtual void computeMatrixElements( const bool useChannelIds ) = 0; + + // Is this a host or device kernel? + virtual bool isOnDevice() const = 0; + + // Dump signalling FPEs (#831 and #837) + static void dumpSignallingFPEs(); + +#ifdef MGONGPU_CHANNELID_DEBUG + // Add a MEK identifier for the channelId debug printout + void setTagForNevtProcessedByChannel( const std::string& tag ) { m_tag = tag; } + + protected: + // Update number of events processed by channel + void updateNevtProcessedByChannel( const unsigned int* pHstChannelIds, const size_t nevt ); + + // Dump number of events processed by channel + void dumpNevtProcessedByChannel(); +#endif + + protected: + + // The buffer for the input momenta + const BufferMomenta& m_momenta; + + // The buffer for the gs to calculate the alphaS values + const BufferGs& m_gs; + + // The buffer for the flavor indices for the flavor combination + const BufferIflavorVec& m_iflavorVec; + + // The buffer for the random numbers for helicity selection + const BufferRndNumHelicity& m_rndhel; + + // The buffer for the random numbers for color selection + const BufferRndNumColor& m_rndcol; + + // The buffer for the channel ids for single-diagram enhancement + const BufferChannelIds& m_channelIds; + + // The buffer for the output matrix elements + BufferMatrixElements& m_matrixElements; + + // The buffer for the output helicity selection + BufferSelectedHelicity& m_selhel; + + // The buffer for the output color selection + BufferSelectedColor& m_selcol; + +#ifdef MGONGPU_CHANNELID_DEBUG + // The events-per-channel counter for debugging + std::map m_nevtProcessedByChannel; 
+ + // The tag for events-per-channel debugging + std::string m_tag; +#endif + }; + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating matrix element calculations on a CPU host + class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt); + + // Destructor + virtual ~MatrixElementKernelHost(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return false; } + + private: + + // Does this host system support the SIMD used in the matrix element calculation? + // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
+ static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false + + private: + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating matrix element calculations on a GPU device + class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferIflavorVec& iflavorVec, // input: flavor indices for the flavor combination + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~MatrixElementKernelDevice(); + + // Reset gpublocks and gputhreads + void setGrid( const int gpublocks, const int gputhreads ); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host 
or device kernel? + bool isOnDevice() const override final { return true; } + + private: + + // The buffer for the event-by-event couplings that depends on alphas QCD + DeviceBufferCouplings m_couplings; + + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; + + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; +#endif + +#ifdef MGONGPU_CHANNELID_DEBUG + // The **host** buffer for the channelId array + // FIXME? MEKD should accept a host buffer as an argument instead of a device buffer, so that a second copy can be avoided? 
+ PinnedHostBufferChannelIds m_hstChannelIds; +#endif + +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The cuBLAS/hipBLAS handle (a single one for all good helicities) + gpuBlasHandle_t m_blasHandle; +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + }; +#endif + + //-------------------------------------------------------------------------- +} +#endif // MATRIXELEMENTKERNELS_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h new file mode 100644 index 0000000000..0d92f69c43 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h @@ -0,0 +1,164 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessAmplitudes_H +#define MemoryAccessAmplitudes_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "MemoryAccessHelpers.h" + +#define MGONGPU_TRIVIAL_AMPLITUDES 1 + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + +#ifndef MGONGPU_TRIVIAL_AMPLITUDES + + // A class describing the internal layout of memory buffers for amplitudes + // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA + // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] + class MemoryAccessAmplitudesBase //_AOSOAv1 + { + public: + + // Number of Events Per Page in the amplitude AOSOA memory buffer layout + static constexpr int neppA = 1; // AOS (just a test...) 
+ + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagA = ievt / neppA; // #event "A-page" + const int ieppA = ievt % neppA; // #event in the current event A-page + constexpr int ix2 = 0; + return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int ix2" and rename "Field" as "Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ix2 ) + { + constexpr int ipagA = 0; + constexpr int ieppA = 0; + return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2Const = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // 
[Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2Const = + MemoryAccessHelper::template ieventAccessFieldConst; + }; + +#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessAmplitudes + { + public: + +#ifndef MGONGPU_TRIVIAL_AMPLITUDES + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2 = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2Const = + KernelAccessHelper::template kernelAccessFieldConst; + +#else + + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + +#endif // #ifndef 
MGONGPU_TRIVIAL_AMPLITUDES + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessAmplitudes HostAccessAmplitudes; + typedef KernelAccessAmplitudes DeviceAccessAmplitudes; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessAmplitudes_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessChannelIds.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessChannelIds.h new file mode 100644 index 0000000000..326aadb5bc --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessChannelIds.h @@ -0,0 +1,125 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Dec 2023, based on earlier work by A. Valassi) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessChannelIds_H +#define MemoryAccessChannelIds_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" +#include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL // fix #893 (not __CUDACC__) +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for channel ids + // This implementation uses a plain ARRAY[nevt] + // [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] + class MemoryAccessChannelIdsBase //_ARRAYv1 + { + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> unsigned int* ieventAccessRecord( unsigned int* buffer, const int ievt ) <===] + static __host__ __device__ inline unsigned int* + ieventAccessRecord( unsigned int* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> unsigned int& decodeRecord( unsigned int* buffer, 
Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to empty and rename "Field" as empty] + static __host__ __device__ inline unsigned int& + decodeRecord( unsigned int* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessChannelIds : public MemoryAccessChannelIdsBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const unsigned int* ieventAccessRecordConst( const unsigned int* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const unsigned int& decodeRecordConst( const unsigned int* buffer ) <===] + static constexpr auto decodeRecordConst = MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const unsigned int& ieventAccessConst( const unsigned int* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = MemoryAccessHelper::template ieventAccessFieldConst<>; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessChannelIds + { + public: + + 
// Expose selected functions from MemoryAccessChannelIds + static constexpr auto ieventAccessRecordConst = MemoryAccessChannelIds::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const unsigned int& kernelAccessConst( const unsigned int* buffer ) <===] + static constexpr auto kernelAccessConst_s = KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (const, SCALAR OR VECTOR) ===> const uint_sv& kernelAccess( const unsigned int* buffer ) <===] + static __host__ __device__ inline const uint_sv& + kernelAccessConst( const unsigned int* buffer ) + { + const unsigned int& out = kernelAccessConst_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferChannelIds::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::uintvFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessChannelIds HostAccessChannelIds; + typedef KernelAccessChannelIds DeviceAccessChannelIds; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessChannelIds_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h new file mode 100644 index 0000000000..56f44a5c41 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h @@ -0,0 +1,270 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessCouplings_H +#define MemoryAccessCouplings_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessMomenta.h" // for MemoryAccessMomentaBase::neppM +#include "MemoryBuffers.h" // for HostBufferCouplings::isaligned + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for couplings + // This implementation uses an AOSOA[npagC][ndcoup][nx2][neppC] "super-buffer" where nevt=npagC*neppC + // From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling + // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] + class MemoryAccessCouplingsBase //_AOSOAv1 + { + public: + + // Number of Events Per Page in the coupling AOSOA memory buffer layout + static constexpr int neppC = MemoryAccessMomentaBase::neppM; // use the same AOSOA striding as for momenta + + // SANITY CHECK: check that neppC is a power of two + static_assert( ispoweroftwo( neppC ), "neppC is not a power of 2" ); + + //-------------------------------------------------------------------------- + // ** NB! A single super-buffer AOSOA[npagC][ndcoup][nx2][neppC] includes data for ndcoup different couplings ** + // ** NB! The ieventAccessRecord and kernelAccess functions refer to the buffer for one individual coupling ** + // ** NB! 
Use idcoupAccessBuffer to add a fixed offset and locate the buffer for one given individual coupling ** + //-------------------------------------------------------------------------- + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (non-const) ===> fptype* idcoupAccessBuffer( fptype* buffer, const int idcoup ) <===] + // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? + static __host__ __device__ inline fptype* + idcoupAccessBuffer( fptype* buffer, // input "super-buffer" + const int idcoup ) + { + constexpr int ipagC = 0; + constexpr int ieppC = 0; + constexpr int ix2 = 0; + // NB! this effectively adds an offset "idcoup * nx2 * neppC" + return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] + } + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (const) ===> const fptype* idcoupAccessBufferConst( const fptype* buffer, const int idcoup ) <===] + // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? 
+ static __host__ __device__ inline const fptype* + idcoupAccessBufferConst( const fptype* buffer, // input "super-buffer" + const int idcoup ) + { + return idcoupAccessBuffer( const_cast( buffer ), idcoup ); + } + + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of couplings that dependent on the running alphas QCD in this specific process + static constexpr size_t ndcoup = Parameters_dependentCouplings::ndcoup; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagC = ievt / neppC; // #event "C-page" + const int ieppC = ievt %% neppC; // #event in the current event C-page + constexpr int idcoup = 0; + constexpr int ix2 = 0; + return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int ix2" and rename "Field" as "Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ix2 ) + { + constexpr int ipagC = 0; + constexpr int ieppC = 0; + // NB! the offset "idcoup * nx2 * neppC" has been added in idcoupAccessBuffer + constexpr int idcoup = 0; + return buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC]; // AOSOA[ipagC][idcoup][ix2][ieppC] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessCouplings : public MemoryAccessCouplingsBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2Const = + MemoryAccessHelper::template 
decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2Const = + MemoryAccessHelper::template ieventAccessFieldConst; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessCouplings + { + public: + + // Expose selected functions from MemoryAccessCouplingsBase + static constexpr auto idcoupAccessBuffer = MemoryAccessCouplingsBase::idcoupAccessBuffer; + static constexpr auto idcoupAccessBufferConst = MemoryAccessCouplingsBase::idcoupAccessBufferConst; + + // Expose selected functions from MemoryAccessCouplings + static constexpr auto ieventAccessRecordConst = MemoryAccessCouplings::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2_s = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, 
SCALAR) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2Const_s = + KernelAccessHelper::template kernelAccessFieldConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccessIx2( fptype* buffer, + const int ix2 ) + { + fptype& out = kernelAccessIx2_s( buffer, ix2 ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays + constexpr int neppC = MemoryAccessCouplingsBase::neppC; + static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS + static_assert( neppC %% neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS + static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) %% mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessIx2Const( const fptype* buffer, + const int ix2 ) + { + return kernelAccessIx2( const_cast( buffer ), ix2 ); + } + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessIx2Const( const fptype* buffer, + const int ix2 ) + { + const fptype& out = kernelAccessIx2Const_s( buffer, ix2 ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays + constexpr int neppC = MemoryAccessCouplingsBase::neppC; + static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS + static_assert( neppC %% neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS + static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) %% mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non const, SCALAR OR VECTOR) ===> cxtype_sv_ref kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline cxtype_sv_ref + kernelAccess( fptype* buffer ) + { + /* + fptype_sv& real = kernelAccessIx2( buffer, 0 ); + fptype_sv& imag = kernelAccessIx2( buffer, 1 ); + printf( "C_ACCESS::kernelAccess: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); + return cxtype_sv_ref( real, imag ); + */ + return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), + kernelAccessIx2( buffer, 1 ) ); + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] + static __host__ __device__ inline cxtype_sv + kernelAccessConst( const fptype* buffer ) + { + /* + const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); + const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); + printf( "C_ACCESS::kernelAccessConst: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); + return cxtype_sv( real, imag ); + */ + return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), + kernelAccessIx2Const( buffer, 1 ) ); + } + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessCouplings HostAccessCouplings; + typedef KernelAccessCouplings DeviceAccessCouplings; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessCouplings_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h 
b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h new file mode 100644 index 0000000000..d2ac450c4b --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h @@ -0,0 +1,84 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Apr 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#ifndef MemoryAccessCouplingsFixed_H +#define MemoryAccessCouplingsFixed_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" +#include "mgOnGpuVectors.h" + +//#include "MemoryAccessHelpers.h" + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for fixed couplings + // This implementation uses a STRUCT[ndcoup][nx2] "super-buffer" layout: in practice, the cIPC global array + // From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling + // [If many implementations are used, a suffix _Sv1 should be appended to the class name] + class MemoryAccessCouplingsFixedBase //_Sv1 + { + public: + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (const) ===> const fptype* iicoupAccessBufferConst( const fptype* buffer, const int iicoup ) <===] + static __host__ __device__ inline const fptype* + iicoupAccessBufferConst( const fptype* buffer, // input "super-buffer": in practice, the cIPC global array + const int iicoup ) + { + constexpr int ix2 = 0; + // NB! 
this effectively adds an offset "iicoup * nx2" + return &( buffer[iicoup * nx2 + ix2] ); // STRUCT[idcoup][ix2] + } + + private: + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessCouplingsFixed + { + public: + + // Expose selected functions from MemoryAccessCouplingsFixedBase + static constexpr auto iicoupAccessBufferConst = MemoryAccessCouplingsFixedBase::iicoupAccessBufferConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] + static __host__ __device__ inline const cxtype_sv + kernelAccessConst( const fptype* buffer ) + { + // TRIVIAL ACCESS to fixed-couplings buffers! + //return cxmake( fptype_sv{ buffer[0] }, fptype_sv{ buffer[1] } ); // NO! BUG #339! 
+ const fptype_sv r_sv = fptype_sv{ 0 } + buffer[0]; + const fptype_sv i_sv = fptype_sv{ 0 } + buffer[1]; + return cxmake( r_sv, i_sv ); // ugly but effective + } + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessCouplingsFixed HostAccessCouplingsFixed; + typedef KernelAccessCouplingsFixed DeviceAccessCouplingsFixed; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessCouplingsFixed_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h new file mode 100644 index 0000000000..32f9be652d --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h @@ -0,0 +1,32 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessDenominators_H +#define MemoryAccessDenominators_H 1 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + +#include "MemoryAccessGs.h" + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for denominators + // This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs + + typedef KernelAccessGs HostAccessDenominators; + typedef KernelAccessGs DeviceAccessDenominators; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif +#endif // MemoryAccessDenominators_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h new file mode 100644 index 0000000000..50a6aaef4d --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h @@ -0,0 +1,170 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessGs_H +#define MemoryAccessGs_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" +#include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for Gs + // This implementation uses a plain ARRAY[nevt] + // [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] + class MemoryAccessGsBase //_ARRAYv1 + { + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessGs : public MemoryAccessGsBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + MemoryAccessHelper::template 
ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessGs + { + public: + + // Expose selected functions from MemoryAccessGs + static constexpr auto ieventAccessRecord = MemoryAccessGs::ieventAccessRecord; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccess( fptype* buffer ) <===] + static constexpr auto kernelAccess_s = + KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non-const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccess( fptype* buffer ) + { + fptype& out = kernelAccess_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + static constexpr auto kernelAccessConst_s = + KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccess( const fptype* buffer ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessConst( const fptype* buffer ) + { + const fptype& out = kernelAccessConst_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessGs HostAccessGs; + typedef KernelAccessGs DeviceAccessGs; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessGs_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h new file mode 100644 index 0000000000..12800d8f51 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h @@ -0,0 +1,157 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessHelpers_H +#define MemoryAccessHelpers_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuFptypes.h" + +//---------------------------------------------------------------------------- + +// A templated helper class that includes the boilerplate code for MemoryAccess classes +template +class MemoryAccessHelper +{ +public: + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = T::ieventAccessRecord; + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline const FT* + ieventAccessRecordConst( const FT* buffer, + const int ievt ) + { + return ieventAccessRecord( const_cast( buffer ), ievt ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + static constexpr auto decodeRecord = T::decodeRecord; + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, Ts... args ) <===] + template + static __host__ __device__ inline const FT& + decodeRecordConst( const FT* buffer, + Ts... args ) // variadic template + { + return T::decodeRecord( const_cast( buffer ), args... 
); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessField( fptype* buffer, const ievt, Ts... args ) <===] + template + static __host__ __device__ inline FT& + ieventAccessField( FT* buffer, + const int ievt, + Ts... args ) // variadic template + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + return T::decodeRecord( T::ieventAccessRecord( buffer, ievt ), args... ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessFieldConst( const fptype* buffer, const ievt, Ts... args ) <===] + template + static __host__ __device__ inline const FT& + ieventAccessFieldConst( const FT* buffer, + const int ievt, + Ts... args ) // variadic template + { + return ieventAccessField( const_cast( buffer ), ievt, args... 
); + } +}; + +//---------------------------------------------------------------------------- + +// A templated helper class that includes the boilerplate code for KernelAccess classes +template +class KernelAccessHelper : public MemoryAccessHelper +{ +public: + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non-const) ===> fptype* kernelAccessRecord( fptype* buffer ) <===] + static __host__ __device__ inline FT* + kernelAccessRecord( FT* buffer ) + { + if constexpr( !onDevice ) // requires c++17 also in CUDA (#333) + { + // FIXME #436: clarify that buffer includes all events on device, and only the record for an event subset on host! + // FIXME #436: am I not assuming that the following line is always identical to buffer for all access classes T? + return T::ieventAccessRecord( buffer, 0 ); + } + else + { +#ifdef MGONGPUCPP_GPUIMPL + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); + return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA +#else + throw std::runtime_error( "kernelAccessRecord on device is only implemented in CUDA" ); +#endif + } + } + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (const) ===> const fptype* kernelAccessRecordConst( const fptype* buffer ) <===] + static __host__ __device__ inline const FT* + kernelAccessRecordConst( const FT* buffer ) + { + return kernelAccessRecord( const_cast( buffer ) ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from a kernel 
event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessField( fptype* buffer, Ts... args ) <===] + template + static __host__ __device__ inline FT& + kernelAccessField( FT* buffer, + Ts... args ) // variadic template + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + return T::decodeRecord( kernelAccessRecord( buffer ), args... ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessFieldConst( const fptype* buffer, Ts... args ) <===] + template + static __host__ __device__ inline const FT& + kernelAccessFieldConst( const FT* buffer, + Ts... args ) // variadic template + { + return kernelAccessField( const_cast( buffer ), args... ); + } + + //-------------------------------------------------------------------------- +}; + +#endif // MemoryAccessHelpers_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessIflavorVec.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessIflavorVec.h new file mode 100644 index 0000000000..954c44b03f --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessIflavorVec.h @@ -0,0 +1,124 @@ +// Copyright (C) 2020-2026 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: D. Massaro (Jan 2026, based on earlier work by A. Valassi) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessIflavorVec_H +#define MemoryAccessIflavorVec_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" +#include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL // fix #893 (not __CUDACC__) +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for channel ids + // This implementation uses a plain ARRAY[nevt] + // [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] + class MemoryAccessIflavorVecBase //_ARRAYv1 + { + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> unsigned int* ieventAccessRecord( unsigned int* buffer, const int ievt ) <===] + static __host__ __device__ inline unsigned int* + ieventAccessRecord( unsigned int* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> unsigned int& decodeRecord( unsigned int* buffer, 
Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to empty and rename "Field" as empty] + static __host__ __device__ inline unsigned int& + decodeRecord( unsigned int* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessIflavorVec : public MemoryAccessIflavorVecBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const unsigned int* ieventAccessRecordConst( const unsigned int* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const unsigned int& decodeRecordConst( const unsigned int* buffer ) <===] + static constexpr auto decodeRecordConst = MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const unsigned int& ieventAccessConst( const unsigned int* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = MemoryAccessHelper::template ieventAccessFieldConst<>; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessIflavorVec + { + public: + + 
// Expose selected functions from MemoryAccessIflavorVec + static constexpr auto ieventAccessRecordConst = MemoryAccessIflavorVec::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const unsigned int& kernelAccessConst( const unsigned int* buffer ) <===] + static constexpr auto kernelAccessConst_s = KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (const, SCALAR OR VECTOR) ===> const uint_sv& kernelAccess( const unsigned int* buffer ) <===] + static __host__ __device__ inline const uint_sv& + kernelAccessConst( const unsigned int* buffer ) + { + const unsigned int& out = kernelAccessConst_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferIflavorVec::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::uintvFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessIflavorVec HostAccessIflavorVec; + typedef KernelAccessIflavorVec DeviceAccessIflavorVec; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessIflavorVec_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h new file mode 100644 index 0000000000..c39a9cdf67 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h @@ -0,0 +1,146 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessMatrixElements_H +#define MemoryAccessMatrixElements_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" +#include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for matrix elements + // This implementation uses a plain ARRAY[nevt] + // [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] + class MemoryAccessMatrixElementsBase //_ARRAYv1 + { + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... 
args ) <===] + // [NB: expand variadic template "Ts... args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessMatrixElements : public MemoryAccessMatrixElementsBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] 
+ static constexpr auto ieventAccess = + MemoryAccessHelper::template ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessMatrixElements + { + public: + + // Expose selected functions from MemoryAccessMatrixElements + static constexpr auto ieventAccessRecord = MemoryAccessMatrixElements::ieventAccessRecord; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccess_s( fptype* buffer ) <===] + static constexpr auto kernelAccess_s = + KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( const fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccess( fptype* buffer ) + { + fptype& out = kernelAccess_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferMatrixElements::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + static constexpr auto kernelAccessConst = + KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessMatrixElements HostAccessMatrixElements; + typedef KernelAccessMatrixElements DeviceAccessMatrixElements; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessMatrixElements_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h new file mode 100644 index 0000000000..1bba0f5e80 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -0,0 +1,275 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessMomenta_H +#define MemoryAccessMomenta_H 1 + +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for momenta + // This implementation uses an AOSOA[npagM][npar][np4][neppM] where nevt=npagM*neppM + // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] + class MemoryAccessMomentaBase //_AOSOAv1 + { + public: + + // Number of Events Per Page in the momenta AOSOA memory buffer layout + // (these are all best kept as a compile-time constants: see issue #23) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + // ----------------------------------------------------------------------------------------------- + // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline + // --- This is relevant to ensure coalesced access to momenta in global memory + // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms + // ----------------------------------------------------------------------------------------------- + //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) + static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) +#else + // ----------------------------------------------------------------------------------------------- + // --- CPUs: neppM is best set equal 
to the number of fptype's (neppV) in a vector register + // --- This is relevant to ensure faster access to momenta from C++ memory cache lines + // --- However, neppM is now decoupled from neppV (issue #176) and can be separately hardcoded + // --- In practice, neppR, neppM and neppV could now (in principle) all be different + // ----------------------------------------------------------------------------------------------- +#ifdef MGONGPU_CPPSIMD + static constexpr int neppM = MGONGPU_CPPSIMD; // (DEFAULT) neppM=neppV for optimal performance + //static constexpr int neppM = 64/sizeof(fptype); // maximum CPU vector width (512 bits): 8 (DOUBLE) or 16 (FLOAT) + //static constexpr int neppM = 32/sizeof(fptype); // lower CPU vector width (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 4.66E6 instead of 5.09E9 in eemumu) + //static constexpr int neppM = MGONGPU_CPPSIMD*2; // FOR TESTS +#else + static constexpr int neppM = 1; // (DEFAULT) neppM=neppV for optimal performance (NB: this is equivalent to AOS) +#endif +#endif /* clang-format on */ + + // SANITY CHECK: check that neppM is a power of two + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a 4-momentum + static constexpr int np4 = CPPProcess::np4; + + // The number of particles in this physics process + static constexpr int npar = CPPProcess::npar; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record 
(output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagM = ievt / neppM; // #event "M-page" + const int ieppM = ievt % neppM; // #event in the current event M-page + constexpr int ip4 = 0; + constexpr int ipar = 0; + return &( buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM] ); // AOSOA[ipagM][ipar][ip4][ieppM] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to "const int ip4, const int ipar" and rename "Field" as "Ip4Ipar"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ip4, + const int ipar ) + { + constexpr int ipagM = 0; + constexpr int ieppM = 0; + return buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM]; // AOSOA[ipagM][ipar][ip4][ieppM] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessMomenta : public MemoryAccessMomentaBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer 
(input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ip4, const int ipar ) <===] + static constexpr auto decodeRecordIp4Ipar = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ip4, const int ipar ) <===] + static constexpr auto decodeRecordIp4IparConst = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIp4Ipar( fptype* buffer, const int ievt, const int ip4, const int ipar ) <===] + static constexpr auto ieventAccessIp4Ipar = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const int ievt, const int ip4, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto ieventAccessIp4IparConst = + MemoryAccessHelper::template ieventAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const int ievt, const int ip4, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + ieventAccessIp4IparConst( 
const fptype* buffer, + const int ievt, + const int ip4, + const int ipar ) + { + const fptype& out = MemoryAccessHelper::template ieventAccessFieldConst( buffer, ievt, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt=%8d out=%8.3f\n", ipar, ip4, ievt, out ); + return out; + } + */ + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessMomenta + { + public: + + // Expose selected functions from MemoryAccessMomenta + static constexpr auto ieventAccessRecordConst = MemoryAccessMomenta::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIp4Ipar( fptype* buffer, const int ip4, const int ipar ) <===] + static constexpr auto kernelAccessIp4Ipar = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ip4, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto kernelAccessIp4IparConst_s = + KernelAccessHelper::template kernelAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ip4, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + kernelAccessIp4IparConst_s( const fptype* 
buffer, + const int ip4, + const int ipar ) + { + const fptype& out = KernelAccessHelper::template kernelAccessFieldConst( buffer, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt='kernel' out=%8.3f\n", ipar, ip4, out ); + return out; + } + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> fptype_sv kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // FIXME? Eventually return by const reference and support aligned arrays only? + // FIXME? Currently return by value to support also unaligned and arbitrary arrays + static __host__ __device__ inline fptype_sv + kernelAccessIp4IparConst( const fptype* buffer, + const int ip4, + const int ipar ) + { + const fptype& out = kernelAccessIp4IparConst_s( buffer, ip4, ipar ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + constexpr int neppM = MemoryAccessMomentaBase::neppM; + constexpr bool useContiguousEventsIfPossible = true; // DEFAULT + //constexpr bool useContiguousEventsIfPossible = false; // FOR PERFORMANCE TESTS (treat as arbitrary array even if it is an AOSOA) + // Use c++17 "if constexpr": compile-time branching + if constexpr( useContiguousEventsIfPossible && ( neppM >= neppV ) && ( neppM % neppV == 0 ) ) + { + //constexpr bool skipAlignmentCheck = true; // FASTEST (SEGFAULTS IF MISALIGNED ACCESS, NEEDS A SANITY CHECK ELSEWHERE!) + constexpr bool skipAlignmentCheck = false; // DEFAULT: A BIT SLOWER BUT SAFER [ALLOWS MISALIGNED ACCESS] + if constexpr( skipAlignmentCheck ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! assume aligned AOSOA, skip check" << std::endl; first=false; } // SLOWER (5.06E6) + // FASTEST? (5.09E6 in eemumu 512y) + // This assumes alignment for momenta1d without checking - causes segmentation fault in reinterpret_cast if not aligned! 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // use reinterpret_cast + } + else if( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! aligned AOSOA, reinterpret cast" << std::endl; first=false; } // SLOWER (5.00E6) + // DEFAULT! A tiny bit (<1%) slower because of the alignment check (5.07E6 in eemumu 512y) + // This explicitly checks buffer alignment to avoid segmentation faults in reinterpret_cast + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + } + else + { + //static bool first=true; if( first ){ std::cout << "WARNING! AOSOA but no reinterpret cast" << std::endl; first=false; } // SLOWER (4.93E6) + // A bit (1%) slower (5.05E6 in eemumu 512y) + // This does not require buffer alignment, but it requires AOSOA with neppM>=neppV and neppM%neppV==0 + return mg5amcCpu::fptypevFromUnalignedArray( out ); // SIMD bulk load of neppV, do not use reinterpret_cast (fewer SIMD operations) + } + } + else + { + //static bool first=true; if( first ){ std::cout << "WARNING! arbitrary array" << std::endl; first=false; } // SLOWER (5.08E6) + // ?!Used to be much slower, now a tiny bit faster for AOSOA?! (5.11E6 for AOSOA, 4.64E6 for AOS in eemumu 512y) + // This does not even require AOSOA with neppM>=neppV and neppM%neppV==0 (e.g. can be used with AOS neppM==1) + constexpr int ievt0 = 0; // just make it explicit in the code that buffer refers to a given ievt0 and decoderIeppV fetches event ievt0+ieppV + auto decoderIeppv = [buffer, ip4, ipar]( int ieppV ) + -> const fptype& + { return MemoryAccessMomenta::ieventAccessIp4IparConst( buffer, ievt0 + ieppV, ip4, ipar ); }; + return mg5amcCpu::fptypevFromArbitraryArray( decoderIeppv ); // iterate over ieppV in neppV (no SIMD) + } +#endif + } + + // Is this a HostAccess or DeviceAccess class? 
+ // [this is only needed for a warning printout in rambo.h for nparf==1 #358] + static __host__ __device__ inline constexpr bool + isOnDevice() + { + return onDevice; + } + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessMomenta HostAccessMomenta; + typedef KernelAccessMomenta DeviceAccessMomenta; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessMomenta_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h new file mode 100644 index 0000000000..298007e9b9 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h @@ -0,0 +1,32 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessNumerators_H +#define MemoryAccessNumerators_H 1 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + +#include "MemoryAccessGs.h" + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for numerators + // This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs + + typedef KernelAccessGs HostAccessNumerators; + typedef KernelAccessGs DeviceAccessNumerators; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif +#endif // MemoryAccessNumerators_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h new file mode 100644 index 0000000000..e3eda115a8 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h @@ -0,0 +1,144 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessRandomNumbers_H +#define MemoryAccessRandomNumbers_H 1 + +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "MemoryAccessHelpers.h" + +#ifdef MGONGPUCPP_GPUIMPL +using mg5amcGpu::CPPProcess; +#else +using mg5amcCpu::CPPProcess; +#endif + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for random numbers +// This implementation uses an AOSOA[npagR][nparf][np4][neppR] where nevt=npagR*neppR +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessRandomNumbersBase //_AOSOAv1 +{ +public: /* clang-format off */ + + // Number of Events Per Page in the random number AOSOA memory buffer layout + // *** NB Different values of neppR lead to different physics results: the *** + // *** same 1d array is generated, but it is interpreted in different ways *** + static constexpr int neppR = 8; // HARDCODED TO GIVE ALWAYS THE SAME PHYSICS RESULTS! 
+ //static constexpr int neppR = 1; // AOS (tests of sectors/requests) + +private: /* clang-format on */ + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a 4-momentum + static constexpr int np4 = CPPProcess::np4; + + // The number of final state particles in this physics process + static constexpr int nparf = CPPProcess::nparf; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagR = ievt / neppR; // #event "R-page" + const int ieppR = ievt % neppR; // #event in the current event R-page + constexpr int ip4 = 0; + constexpr int iparf = 0; + return &( buffer[ipagR * nparf * np4 * neppR + iparf * np4 * neppR + ip4 * neppR + ieppR] ); // AOSOA[ipagR][iparf][ip4][ieppR] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int ip4, const int iparf" and rename "Field" as "Ip4Iparf"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ip4, + const int iparf ) + { + constexpr int ipagR = 0; + constexpr int ieppR = 0; + return buffer[ipagR * nparf * np4 * neppR + iparf * np4 * neppR + ip4 * neppR + ieppR]; // AOSOA[ipagR][iparf][ip4][ieppR] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessRandomNumbers : public MemoryAccessRandomNumbersBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ip4, const int iparf ) <===] + static constexpr auto decodeRecordIp4Iparf = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ip4, const int iparf ) <===] + static constexpr auto decodeRecordIp4IparfConst = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a 
field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIp4Iparf( fptype* buffer, const int ievt, const int ip4, const int iparf ) <===] + static constexpr auto ieventAccessIp4Iparf = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparfConst( const fptype* buffer, const int ievt, const int ip4, const int iparf ) <===] + static constexpr auto ieventAccessIp4IparfConst = + MemoryAccessHelper::template ieventAccessFieldConst; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessRandomNumbers +{ +public: + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIp4Iparf( fptype* buffer, const int ip4, const int iparf ) <===] + static constexpr auto kernelAccessIp4Iparf = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIp4IparfConst( const fptype* buffer, const int ip4, const int iparf ) <===] + static constexpr auto kernelAccessIp4IparfConst = + KernelAccessHelper::template kernelAccessFieldConst; +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessRandomNumbers HostAccessRandomNumbers; +typedef 
KernelAccessRandomNumbers DeviceAccessRandomNumbers; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessRandomNumbers_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h new file mode 100644 index 0000000000..6f6623aafc --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h @@ -0,0 +1,137 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#ifndef MemoryAccessVectors_H +#define MemoryAccessVectors_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#ifndef MGONGPUCPP_GPUIMPL +namespace mg5amcCpu // this is only needed for CPU SIMD vectorization +{ + +#ifdef MGONGPU_CPPSIMD + //-------------------------------------------------------------------------- + + // Cast one non-const fptype_v reference (one vector of neppV fptype values) from one non-const fptype reference (#435), + // assuming that "pointer(evt#0)+1" indicates "pointer(evt#1)", and that the arrays are aligned + inline fptype_v& fptypevFromAlignedArray( fptype& ref ) + { + return *reinterpret_cast( &ref ); + } + + inline uint_v& uintvFromAlignedArray( unsigned int& ref ) + { + return *reinterpret_cast( &ref ); + } + + // Cast one const fptype_v reference (one vector of neppV fptype values) from one const fptype reference, + // assuming that "pointer(evt#0)+1" indicates "pointer(evt#1)", and that the arrays are aligned + inline const fptype_v& fptypevFromAlignedArray( const fptype& ref ) + { + return *reinterpret_cast( &ref ); + } + + inline const uint_v& uintvFromAlignedArray( const unsigned int& ref ) + { + return *reinterpret_cast( &ref 
); + } + + // Build one fptype_v (one vector of neppV fptype values) from one fptype reference, + // assuming that "pointer(evt#0)+1" indicates "pointer(evt#1)", but that the arrays are not aligned + inline fptype_v fptypevFromUnalignedArray( const fptype& ref ) + { +#if MGONGPU_CPPSIMD == 2 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (2) + *( &ref + 1 ) }; +#elif MGONGPU_CPPSIMD == 4 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (4) + *( &ref + 1 ), + *( &ref + 2 ), + *( &ref + 3 ) }; +#elif MGONGPU_CPPSIMD == 8 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (8) + *( &ref + 1 ), + *( &ref + 2 ), + *( &ref + 3 ), + *( &ref + 4 ), + *( &ref + 5 ), + *( &ref + 6 ), + *( &ref + 7 ) }; +#elif MGONGPU_CPPSIMD == 16 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (16) + *( &ref + 1 ), + *( &ref + 2 ), + *( &ref + 3 ), + *( &ref + 4 ), + *( &ref + 5 ), + *( &ref + 6 ), + *( &ref + 7 ), + *( &ref + 8 ), + *( &ref + 9 ), + *( &ref + 10 ), + *( &ref + 11 ), + *( &ref + 12 ), + *( &ref + 13 ), + *( &ref + 14 ), + *( &ref + 15 ) }; +#else +#error Internal error! 
Unknown MGONGPU_CPPSIMD value +#endif + } + + // Build one fptype_v (one vector of neppV fptype values) from one fptype reference, + // with no a priori assumption on how the input fptype array should be decoded + template + inline fptype_v fptypevFromArbitraryArray( Functor decoderIeppv ) + { +#if MGONGPU_CPPSIMD == 2 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (2) + decoderIeppv( 1 ) }; +#elif MGONGPU_CPPSIMD == 4 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (4) + decoderIeppv( 1 ), + decoderIeppv( 2 ), + decoderIeppv( 3 ) }; +#elif MGONGPU_CPPSIMD == 8 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (8) + decoderIeppv( 1 ), + decoderIeppv( 2 ), + decoderIeppv( 3 ), + decoderIeppv( 4 ), + decoderIeppv( 5 ), + decoderIeppv( 6 ), + decoderIeppv( 7 ) }; +#elif MGONGPU_CPPSIMD == 16 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (16) + decoderIeppv( 1 ), + decoderIeppv( 2 ), + decoderIeppv( 3 ), + decoderIeppv( 4 ), + decoderIeppv( 5 ), + decoderIeppv( 6 ), + decoderIeppv( 7 ), + decoderIeppv( 8 ), + decoderIeppv( 9 ), + decoderIeppv( 10 ), + decoderIeppv( 11 ), + decoderIeppv( 12 ), + decoderIeppv( 13 ), + decoderIeppv( 14 ), + decoderIeppv( 15 ) }; +#else +#error Internal error! Unknown MGONGPU_CPPSIMD value +#endif + } + + //-------------------------------------------------------------------------- +#endif + +} // end namespace +#endif + +#endif // MemoryAccessVectors_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h new file mode 100644 index 0000000000..9f4c620bc7 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h @@ -0,0 +1,169 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. 
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#ifndef MemoryAccessWavefunctions_H +#define MemoryAccessWavefunctions_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "MemoryAccessHelpers.h" + +#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + +#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + + // A class describing the internal layout of memory buffers for wavefunctions + // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW + // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] + class MemoryAccessWavefunctionsBase //_AOSOAv1 + { + public: + + // Number of Events Per Page in the wavefunction AOSOA memory buffer layout + static constexpr int neppW = 1; // AOS (just a test...) 
+ + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a (fermion or vector) wavefunction + static constexpr int nw6 = mgOnGpu::nw6; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagW = ievt / neppW; // #event "W-page" + const int ieppW = ievt % neppW; // #event in the current event W-page + constexpr int iw6 = 0; + constexpr int ix2 = 0; + return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int iw6, + const int ix2 ) + { + constexpr int ipagW = 0; + constexpr int ieppW = 0; + return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto decodeRecordIw6Ix2Const = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a 
memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] + static constexpr auto ieventAccessIw6Ix2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] + static constexpr auto ieventAccessIw6Ix2Const = + MemoryAccessHelper::template ieventAccessFieldConst; + }; + +#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessWavefunctions + { + public: + +#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto kernelAccessIw6Ix2 = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto kernelAccessIw6Ix2Const = + KernelAccessHelper::template kernelAccessFieldConst; + +#else + + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) + { + return 
reinterpret_cast( buffer ); + } + + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + +#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessWavefunctions HostAccessWavefunctions; + typedef KernelAccessWavefunctions DeviceAccessWavefunctions; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessWavefunctions_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWeights.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWeights.h new file mode 100644 index 0000000000..df3c568b12 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWeights.h @@ -0,0 +1,149 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryAccessWeights_H +#define MemoryAccessWeights_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for weights + // This implementation uses a plain ARRAY[nevt] + // [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] + class MemoryAccessWeightsBase //_ARRAYv1 + { + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessWeights : public MemoryAccessWeightsBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + 
MemoryAccessHelper::template ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessWeights + { + public: + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccess( fptype* buffer ) <===] + // FINAL IMPLEMENTATION FOR CUDA 11.4 + static constexpr auto kernelAccess = + KernelAccessHelper::template kernelAccessField<>; + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccess( fptype* buffer ) <===] + // TEMPORARY HACK FOR CUDA 11.1 + static __host__ __device__ inline fptype& + kernelAccess( fptype* buffer ) + { + return KernelAccessHelper::template kernelAccessField<>( buffer ); + } + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + // FINAL IMPLEMENTATION FOR CUDA 11.4 + static constexpr auto kernelAccessConst = + KernelAccessHelper::template kernelAccessFieldConst<>; + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing 
mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + // TEMPORARY HACK FOR CUDA 11.1 + static __host__ __device__ inline const fptype& + kernelAccessConst( const fptype* buffer ) + { + return KernelAccessHelper::template kernelAccessFieldConst<>( buffer ); + } + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessWeights HostAccessWeights; + typedef KernelAccessWeights DeviceAccessWeights; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MemoryAccessWeights_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h new file mode 100644 index 0000000000..8b45069832 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -0,0 +1,606 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MemoryBuffers_H +#define MemoryBuffers_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "Parameters.h" +#include "processConfig.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + namespace MemoryBuffers + { + // Process-independent compile-time constants + static constexpr size_t np4 = CPPProcess::np4; + static constexpr size_t nw6 = CPPProcess::nw6; + static constexpr size_t nx2 = mgOnGpu::nx2; + // Process-dependent compile-time constants + static constexpr size_t nparf = CPPProcess::nparf; + static constexpr size_t npar = CPPProcess::npar; + static constexpr size_t ndcoup = Parameters_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; + } + + //-------------------------------------------------------------------------- + + // An abstract interface encapsulating a given number of events + class INumberOfEvents + { + public: + virtual ~INumberOfEvents() {} + virtual size_t nevt() const = 0; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating a given number of events + class NumberOfEvents : virtual public INumberOfEvents + { + public: + NumberOfEvents( const size_t nevt ) + : m_nevt( nevt ) {} + virtual ~NumberOfEvents() {} + virtual size_t nevt() const override { return m_nevt; } + private: + const size_t m_nevt; + }; + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer (not necessarily an event buffer) + template + class BufferBase : virtual public INumberOfEvents + { + protected: + BufferBase( const size_t size, const bool onDevice ) + : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} + public: + virtual ~BufferBase() {} + T* data() { return m_data; } + 
const T* data() const { return m_data; } + T& operator[]( const size_t index ) { return m_data[index]; } + const T& operator[]( const size_t index ) const { return m_data[index]; } + size_t size() const { return m_size; } + size_t bytes() const { return m_size * sizeof( T ); } + bool isOnDevice() const { return m_isOnDevice; } + virtual size_t nevt() const override { throw std::runtime_error( "This BufferBase is not an event buffer" ); } + protected: + const size_t m_size; + T* m_data; + const bool m_isOnDevice; + }; + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + constexpr bool HostBufferALIGNED = false; // ismisaligned=false + constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true + + // A class encapsulating a C++ host buffer + template + class HostBufferBase : public BufferBase + { + public: + HostBufferBase( const size_t size ) + : BufferBase( size, false ) + { + if constexpr( !ismisaligned ) + this->m_data = new( std::align_val_t( cppAlign ) ) T[size](); + else + this->m_data = new( std::align_val_t( cppAlign ) ) T[size + 1]() + 1; // TEST MISALIGNMENT! + } + virtual ~HostBufferBase() + { + if constexpr( !ismisaligned ) + ::operator delete[]( this->m_data, std::align_val_t( cppAlign ) ); + else + ::operator delete[]( ( this->m_data ) - 1, std::align_val_t( cppAlign ) ); // TEST MISALIGNMENT! 
+ } + static constexpr bool isaligned() { return !ismisaligned; } + public: + static constexpr size_t cppAlign = mgOnGpu::cppAlign; + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a CUDA pinned host buffer + template + class PinnedHostBufferBase : public BufferBase + { + public: + PinnedHostBufferBase( const size_t size ) + : BufferBase( size, false ) + { + gpuMallocHost( &( this->m_data ), this->bytes() ); + } + virtual ~PinnedHostBufferBase() + { + gpuFreeHost( this->m_data ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a CUDA device buffer + template + class DeviceBufferBase : public BufferBase + { + public: + DeviceBufferBase( const size_t size ) + : BufferBase( size, true ) + { + gpuMalloc( &( this->m_data ), this->bytes() ); + } + virtual ~DeviceBufferBase() + { + gpuFree( this->m_data ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for a given number of events + template + class HostBuffer : public HostBufferBase, virtual private NumberOfEvents + { + public: + HostBuffer( const size_t nevt ) + : NumberOfEvents( nevt ) + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } + virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a CUDA pinned host buffer for a given number of events + template + class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents + { + public: + 
PinnedHostBuffer( const size_t nevt ) + : NumberOfEvents( nevt ) + , PinnedHostBufferBase( sizePerEvent * nevt ) {} + virtual ~PinnedHostBuffer() {} + virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a CUDA device buffer for a given number of events + template + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents + { + public: + DeviceBuffer( const size_t nevt ) + : NumberOfEvents( nevt ) + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } + virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for momenta random numbers + typedef BufferBase BufferRndNumMomenta; + + // The size (number of elements) per event in a memory buffer for momenta random numbers + constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for momenta random numbers + typedef HostBuffer HostBufferRndNumMomenta; +#else + // A class encapsulating a CUDA pinned host buffer for momenta random numbers + typedef PinnedHostBuffer PinnedHostBufferRndNumMomenta; + // A class encapsulating a CUDA device buffer for momenta random numbers + typedef DeviceBuffer 
DeviceBufferRndNumMomenta; +#endif + + //-------------------------------------------------------------------------- + + /* + // A base class encapsulating a memory buffer with ONE fptype per event + typedef BufferBase BufferOneFp; + + // The size (number of elements) per event in a memory buffer with ONE fptype per event + constexpr size_t sizePerEventOneFp = 1; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer with ONE fptype per event + typedef HostBuffer HostBufferOneFp; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferOneFp; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferOneFp; +#endif + + // Memory buffers for Gs (related to the event-by-event strength of running coupling constant alphas QCD) + typedef BufferOneFp BufferGs; + typedef HostBufferOneFp HostBufferGs; + typedef PinnedHostBufferOneFp PinnedHostBufferGs; + typedef DeviceBufferOneFp DeviceBufferGs; + */ + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for Gs (related to the event-by-event strength of running coupling constant alphas QCD) + typedef BufferBase BufferGs; + + // The size (number of elements) per event in a memory buffer for Gs + constexpr size_t sizePerEventGs = 1; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for gs + typedef HostBuffer HostBufferGs; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferGs; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferGs; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // A base class encapsulating a memory buffer for numerators (of the multichannel single-diagram enhancement factors) + typedef BufferBase BufferNumerators; + + // The size 
(number of elements) per event in a memory buffer for numerators + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for numerators + typedef HostBuffer HostBufferNumerators; +#else + // A class encapsulating a CUDA pinned host buffer for numerators + typedef PinnedHostBuffer PinnedHostBufferNumerators; + // A class encapsulating a CUDA device buffer for numerators + typedef DeviceBuffer DeviceBufferNumerators; +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // A base class encapsulating a memory buffer for denominators (of the multichannel single-diagram enhancement factors) + typedef BufferBase BufferDenominators; + + // The size (number of elements) per event in a memory buffer for denominators + constexpr size_t sizePerEventDenominators = 1; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for denominators + typedef HostBuffer HostBufferDenominators; +#else + // A class encapsulating a CUDA pinned host buffer for denominators + typedef PinnedHostBuffer PinnedHostBufferDenominators; + // A class encapsulating a CUDA device buffer for denominators + typedef DeviceBuffer DeviceBufferDenominators; +#endif +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for couplings that depend on the event-by-event running coupling constant alphas QCD + typedef BufferBase BufferCouplings; + + // The size (number of elements) per event in a memory buffer for random numbers + constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for couplings + typedef HostBuffer HostBufferCouplings; +#else + // A class encapsulating a 
CUDA pinned host buffer for couplings + typedef PinnedHostBuffer PinnedHostBufferCouplings; + // A class encapsulating a CUDA device buffer for couplings + typedef DeviceBuffer DeviceBufferCouplings; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for momenta + typedef BufferBase BufferMomenta; + + // The size (number of elements) per event in a memory buffer for momenta + constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for momenta + typedef HostBuffer HostBufferMomenta; + //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! +#else + // A class encapsulating a CUDA pinned host buffer for momenta + typedef PinnedHostBuffer PinnedHostBufferMomenta; + // A class encapsulating a CUDA device buffer for momenta + typedef DeviceBuffer DeviceBufferMomenta; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for sampling weights + typedef BufferBase BufferWeights; + + // The size (number of elements) per event in a memory buffer for sampling weights + constexpr size_t sizePerEventWeights = 1; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for sampling weights + typedef HostBuffer HostBufferWeights; +#else + // A class encapsulating a CUDA pinned host buffer for sampling weights + typedef PinnedHostBuffer PinnedHostBufferWeights; + // A class encapsulating a CUDA device buffer for sampling weights + typedef DeviceBuffer DeviceBufferWeights; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for matrix elements + typedef BufferBase BufferMatrixElements; + + // The size (number of elements) per event in a memory buffer for matrix elements + constexpr size_t 
sizePerEventMatrixElements = 1; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for matrix elements + typedef HostBuffer HostBufferMatrixElements; +#else + // A class encapsulating a CUDA pinned host buffer for matrix elements + typedef PinnedHostBuffer PinnedHostBufferMatrixElements; + // A class encapsulating a CUDA device buffer for matrix elements + typedef DeviceBuffer DeviceBufferMatrixElements; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for the helicity mask + typedef BufferBase BufferHelicityMask; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for the helicity mask + typedef HostBufferBase HostBufferHelicityMask; +#else + // A class encapsulating a CUDA pinned host buffer for the helicity mask + typedef PinnedHostBufferBase PinnedHostBufferHelicityMask; + // A class encapsulating a CUDA device buffer for the helicity mask + typedef DeviceBufferBase DeviceBufferHelicityMask; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for wavefunctions + typedef BufferBase BufferWavefunctions; + + // The size (number of elements) per event in a memory buffer for wavefunctions + constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for wavefunctions + typedef HostBuffer HostBufferWavefunctions; +#else + // A class encapsulating a CUDA pinned host buffer for wavefunctions + typedef PinnedHostBuffer PinnedHostBufferWavefunctions; + // A class encapsulating a CUDA device buffer for wavefunctions + typedef DeviceBuffer DeviceBufferWavefunctions; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for helicity random numbers + typedef BufferBase 
BufferRndNumHelicity; + + // The size (number of elements) per event in a memory buffer for helicity random numbers + constexpr size_t sizePerEventRndNumHelicity = 1; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for helicity random numbers + typedef HostBuffer HostBufferRndNumHelicity; +#else + // A class encapsulating a CUDA pinned host buffer for helicity random numbers + typedef PinnedHostBuffer PinnedHostBufferRndNumHelicity; + // A class encapsulating a CUDA device buffer for helicity random numbers + typedef DeviceBuffer DeviceBufferRndNumHelicity; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for color random numbers + typedef BufferBase BufferRndNumColor; + + // The size (number of elements) per event in a memory buffer for color random numbers + constexpr size_t sizePerEventRndNumColor = 1; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for color random numbers + typedef HostBuffer HostBufferRndNumColor; +#else + // A class encapsulating a CUDA pinned host buffer for color random numbers + typedef PinnedHostBuffer PinnedHostBufferRndNumColor; + // A class encapsulating a CUDA device buffer for color random numbers + typedef DeviceBuffer DeviceBufferRndNumColor; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for channel ids + typedef BufferBase BufferChannelIds; + + // The size (number of elements) per event in a memory buffer for channel ids + constexpr size_t sizePerEventChannelId = 1; + +#ifndef MGONGPUCPP_GPUIMPL // fix #893 (not __CUDACC__) + // A class encapsulating a C++ host buffer for channel ids + typedef HostBuffer HostBufferChannelIds; +#else + // A class encapsulating a CUDA pinned host buffer for channel ids + typedef PinnedHostBuffer PinnedHostBufferChannelIds; + // A class encapsulating a CUDA device 
buffer for channel ids + typedef DeviceBuffer DeviceBufferChannelIds; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for channel ids + typedef BufferBase BufferIflavorVec; + + // The size (number of elements) per event in a memory buffer for channel ids + constexpr size_t sizePerEventIflavorVec = 1; + +#ifndef MGONGPUCPP_GPUIMPL // fix #893 (not __CUDACC__) + // A class encapsulating a C++ host buffer for channel ids + typedef HostBuffer HostBufferIflavorVec; +#else + // A class encapsulating a CUDA pinned host buffer for channel ids + typedef PinnedHostBuffer PinnedHostBufferIflavorVec; + // A class encapsulating a CUDA device buffer for channel ids + typedef DeviceBuffer DeviceBufferIflavorVec; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for helicity selection + typedef BufferBase BufferSelectedHelicity; + + // The size (number of elements) per event in a memory buffer for helicity selection + constexpr size_t sizePerEventSelectedHelicity = 1; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for helicity selection + typedef HostBuffer HostBufferSelectedHelicity; +#else + // A class encapsulating a CUDA pinned host buffer for helicity selection + typedef PinnedHostBuffer PinnedHostBufferSelectedHelicity; + // A class encapsulating a CUDA device buffer for helicity selection + typedef DeviceBuffer DeviceBufferSelectedHelicity; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for color selection + typedef BufferBase BufferSelectedColor; + + // The size (number of elements) per event in a memory buffer for color selection + constexpr size_t sizePerEventSelectedColor = 1; + +#ifndef MGONGPUCPP_GPUIMPL + // A class encapsulating a C++ host buffer for color selection + 
typedef HostBuffer HostBufferSelectedColor; +#else + // A class encapsulating a CUDA pinned host buffer for color selection + typedef PinnedHostBuffer PinnedHostBufferSelectedColor; + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferSelectedColor; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + template + void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy + { + if( dst.size() != src.size() ) + { + std::ostringstream sstr; + sstr << "Size (#elements) mismatch in copyDeviceFromHost: dst=" << dst.size() << ", src=" << src.size(); + throw std::runtime_error( sstr.str() ); + } + if( dst.bytes() != src.bytes() ) + { + std::ostringstream sstr; + sstr << "Size (#bytes) mismatch in copyDeviceFromHost: dst=" << dst.bytes() << ", src=" << src.bytes(); + throw std::runtime_error( sstr.str() ); + } + // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + template + void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy + { + if( dst.size() != src.size() ) + { + std::ostringstream sstr; + sstr << "Size (#elements) mismatch in copyHostFromDevice: dst=" << dst.size() << ", src=" << src.size(); + throw std::runtime_error( 
sstr.str() ); + } + if( dst.bytes() != src.bytes() ) + { + std::ostringstream sstr; + sstr << "Size (#bytes) mismatch in copyHostFromDevice: dst=" << dst.bytes() << ", src=" << src.bytes(); + throw std::runtime_error( sstr.str() ); + } + // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + } +#endif + + //-------------------------------------------------------------------------- +} + +#endif // MemoryBuffers_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc new file mode 100644 index 0000000000..e62c4dd482 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -0,0 +1,183 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#include "RamboSamplingKernels.h" + +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessRandomNumbers.h" +#include "MemoryAccessWeights.h" +#include "MemoryBuffers.h" +#include "rambo.h" // inline implementation of RAMBO algorithms and kernels + +#include + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + RamboSamplingKernelHost::RamboSamplingKernelHost( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t nevt ) + : SamplingKernelBase( energy, rndmom, momenta, weights ) + , NumberOfEvents( nevt ) + { + if( m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: rndmom must be a host array" ); + if( m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: momenta must be a host array" ); + if( m_weights.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: weights must be a host array" ); + if( this->nevt() != m_rndmom.nevt() ) throw std::runtime_error( "RamboSamplingKernelHost: nevt mismatch with rndmom" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "RamboSamplingKernelHost: nevt mismatch with momenta" ); + if( this->nevt() != m_weights.nevt() ) throw std::runtime_error( "RamboSamplingKernelHost: nevt mismatch with weights" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelHost: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Sanity checks for memory access (random number buffer) + constexpr int neppR 
= MemoryAccessRandomNumbers::neppR; // AOSOA layout + static_assert( ispoweroftwo( neppR ), "neppR is not a power of 2" ); + if( nevt % neppR != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelHost: nevt should be a multiple of neppR=" << neppR; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + void + RamboSamplingKernelHost::getMomentaInitial() + { + constexpr auto getMomentaInitial = ramboGetMomentaInitial; + // ** START LOOP ON IEVT ** + for( size_t ievt = 0; ievt < nevt(); ++ievt ) + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( m_momenta.data(), ievt ); + getMomentaInitial( m_energy, ievtMomenta ); + } + // ** END LOOP ON IEVT ** + } + + //-------------------------------------------------------------------------- + + void + RamboSamplingKernelHost::getMomentaFinal() + { + constexpr auto getMomentaFinal = ramboGetMomentaFinal; + // ** START LOOP ON IEVT ** + for( size_t ievt = 0; ievt < nevt(); ++ievt ) + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + const fptype* ievtRndmom = MemoryAccessRandomNumbers::ieventAccessRecordConst( m_rndmom.data(), ievt ); + fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( m_momenta.data(), ievt ); + fptype* ievtWeights = MemoryAccessWeights::ieventAccessRecord( m_weights.data(), ievt ); + getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights ); + } + // ** END LOOP ON IEVT ** + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: 
momenta + BufferWeights& weights, // output: weights + const size_t gpublocks, + const size_t gputhreads ) + : SamplingKernelBase( energy, rndmom, momenta, weights ) + , NumberOfEvents( gpublocks * gputhreads ) + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if( !m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: rndmom must be a device array" ); + if( !m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: momenta must be a device array" ); + if( !m_weights.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: weights must be a device array" ); + if( m_gpublocks == 0 ) throw std::runtime_error( "RamboSamplingKernelDevice: gpublocks must be > 0" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "RamboSamplingKernelDevice: gputhreads must be > 0" ); + if( this->nevt() != m_rndmom.nevt() ) throw std::runtime_error( "RamboSamplingKernelDevice: nevt mismatch with rndmom" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "RamboSamplingKernelDevice: nevt mismatch with momenta" ); + if( this->nevt() != m_weights.nevt() ) throw std::runtime_error( "RamboSamplingKernelDevice: nevt mismatch with weights" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( m_gputhreads % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelHost: gputhreads should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Sanity checks for memory access (random number buffer) + constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout + static_assert( ispoweroftwo( neppR ), "neppR is not a power of 2" ); + if( m_gputhreads % neppR != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelDevice: gputhreads should be a multiple of neppR=" << neppR; + throw 
std::runtime_error( sstr.str() ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + getMomentaInitialDevice( const fptype energy, + fptype* momenta ) + { + constexpr auto getMomentaInitial = ramboGetMomentaInitial; + return getMomentaInitial( energy, momenta ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + RamboSamplingKernelDevice::getMomentaInitial() + { + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + getMomentaFinalDevice( const fptype energy, + const fptype* rndmom, + fptype* momenta, + fptype* wgts ) + { + constexpr auto getMomentaFinal = ramboGetMomentaFinal; + return getMomentaFinal( energy, rndmom, momenta, wgts ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + RamboSamplingKernelDevice::getMomentaFinal() + { + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + } +#endif + + //-------------------------------------------------------------------------- +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h new file mode 100644 index 0000000000..a217619b9c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -0,0 +1,134 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2024) for the MG5aMC CUDACPP plugin. + +#ifndef RAMBOSAMPLINGKERNELS_H +#define RAMBOSAMPLINGKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryBuffers.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // A base class encapsulating phase space sampling on a CPU host or on a GPU device + class SamplingKernelBase //: virtual public ISamplingKernel + { + protected: + + // Constructor from existing input and output buffers + SamplingKernelBase( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights ) // output: weights + : m_energy( energy ) + , m_rndmom( rndmom ) + , m_momenta( momenta ) + , m_weights( weights ) + { + } + + public: + + // Destructor + virtual ~SamplingKernelBase() {} + + // Get momenta of initial state particles + virtual void getMomentaInitial() = 0; + + // Get momenta of final state particles and weights + virtual void getMomentaFinal() = 0; + + // Is this a host or device kernel? 
+ virtual bool isOnDevice() const = 0; + + protected: + + // The energy + const fptype m_energy; + + // The buffer for the input random numbers + const BufferRndNumMomenta& m_rndmom; + + // The buffer for the output momenta + BufferMomenta& m_momenta; + + // The buffer for the output weights + BufferWeights& m_weights; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating RAMBO phase space sampling on a CPU host + class RamboSamplingKernelHost final : public SamplingKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + RamboSamplingKernelHost( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t nevt ); + + // Destructor + virtual ~RamboSamplingKernelHost() {} + + // Get momenta of initial state particles + void getMomentaInitial() override final; + + // Get momenta of final state particles and weights + void getMomentaFinal() override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + }; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating RAMBO phase space sampling on a GPU device + class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + RamboSamplingKernelDevice( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~RamboSamplingKernelDevice() {} + + // Get momenta of initial state particles + void getMomentaInitial() override final; + + // Get momenta of final state particles and weights + void getMomentaFinal() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return true; } + + private: + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + }; +#endif + + //-------------------------------------------------------------------------- +} +#endif // RAMBOSAMPLINGKERNELS_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h new file mode 100644 index 0000000000..7ed728a26c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -0,0 +1,191 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef RANDOMNUMBERKERNELS_H +#define RANDOMNUMBERKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryBuffers.h" + +// Forward definition from curand.h (the full header is only needed in CurandRandomKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + /* + // An interface encapsulating random number generation on a CPU host or on a GPU device + class IRandomNumberKernel + { + public: + + // Destructor + virtual ~IRandomNumberKernel(){} + + // Seed the random number generator + virtual void seedGenerator( const unsigned int seed ) = 0; + + // Generate the random number array + virtual void generateRnarray() = 0; + + // Is this a host or device kernel? + virtual bool isOnDevice() const = 0; + + }; + */ + + //-------------------------------------------------------------------------- + + // A base class encapsulating random number generation on a CPU host or on a GPU device + class RandomNumberKernelBase //: virtual public IRandomNumberKernel + { + + protected: + + // Constructor from an existing output buffer + RandomNumberKernelBase( BufferRndNumMomenta& rnarray ) + : m_rnarray( rnarray ) {} + + public: + + // Destructor + virtual ~RandomNumberKernelBase() {} + + // Seed the random number generator + virtual void seedGenerator( const unsigned int seed ) = 0; + + // Generate the random number array + virtual void generateRnarray() = 0; + + // Is this a host or device kernel? 
+ virtual bool isOnDevice() const = 0; + + protected: + + // The buffer for the output random numbers + BufferRndNumMomenta& m_rnarray; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating common random number generation on a CPU host + class CommonRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + CommonRandomNumberKernel( BufferRndNumMomenta& rnarray ); + + // Destructor + ~CommonRandomNumberKernel() {} + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final { m_seed = seed; }; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return false; } + + private: + + // The generator seed + unsigned int m_seed; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating CURAND random number generation on a CPU host or on a GPU device + class CurandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + CurandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~CurandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? 
+ const bool m_isOnDevice; + + // The curand generator + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') + curandGenerator_st* m_rnGen; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? 
+ const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; + + //-------------------------------------------------------------------------- +} +#endif // RANDOMNUMBERKERNELS_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc new file mode 100644 index 0000000000..b552dd0afe --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -0,0 +1,1243 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Thete, A. Valassi (2020-2026) for the MG5aMC CUDACPP plugin. 
+//========================================================================== + +#include "mgOnGpuConfig.h" + +#include "BridgeKernels.h" +#include "CPPProcess.h" +#include "CrossSectionKernels.h" +#include "GpuRuntime.h" +#include "MatrixElementKernels.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessRandomNumbers.h" +#include "MemoryAccessWeights.h" +#include "MemoryBuffers.h" +#include "RamboSamplingKernels.h" +#include "RandomNumberKernels.h" +#include "epoch_process_id.h" +#include "ompnumthreads.h" +#include "timermap.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define STRINGIFY( s ) #s +#define XSTRINGIFY( s ) STRINGIFY( s ) + +#define SEP79 79 + +bool +is_number( const char* s ) +{ + const char* t = s; + while( *t != '\0' && isdigit( *t ) ) + ++t; + return (int)strlen( s ) == t - s; +} + +int +usage( char* argv0, int ret = 1 ) +{ + std::cout << "Usage: " << argv0 + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; + std::cout << std::endl; + std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; + std::cout << "(also in CPU/C++ code, where only the product of these two parameters counts)" << std::endl; + std::cout << std::endl; + std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; + std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; +#ifndef MGONGPUCPP_GPUIMPL +#ifdef _OPENMP + std::cout << std::endl; + std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; + std::cout << "(OMP multithreading will be disabled if OMP_NUM_THREADS is not set)" << std::endl; +#endif +#endif 
+ return ret; +} + +int +main( int argc, char** argv ) +{ + // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + // DEFAULTS FOR COMMAND LINE ARGUMENTS + bool verbose = false; + bool debug = false; + bool perf = false; + bool json = false; + unsigned int niter = 0; + unsigned int gpublocks = 1; + unsigned int gputhreads = 32; + unsigned int jsondate = 0; + unsigned int jsonrun = 0; + unsigned int numvec[5] = { 0, 0, 0, 0, 0 }; + int nnum = 0; + // Random number mode + enum class RandomNumberMode + { + CommonRandom = 0, + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 + }; +#if defined __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif +#endif + // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) 
+ enum class RamboSamplingMode + { + RamboHost = 1, + RamboDevice = 2 + }; +#ifdef MGONGPUCPP_GPUIMPL + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU +#else + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU +#endif + // Bridge emulation mode (NB Bridge implies RamboHost!) + bool bridge = false; + + // READ COMMAND LINE ARGUMENTS + for( int argn = 1; argn < argc; ++argn ) + { + std::string arg = argv[argn]; + if( ( arg == "--verbose" ) || ( arg == "-v" ) ) + { + verbose = true; + } + else if( ( arg == "--debug" ) || ( arg == "-d" ) ) + { + debug = true; + } + else if( ( arg == "--performance" ) || ( arg == "-p" ) ) + { + perf = true; + } + else if( ( arg == "--json" ) || ( arg == "-j" ) ) + { + json = true; + } + else if( arg == "--curdev" ) + { +#ifndef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandDevice; +#endif + } + else if( arg == "--curhst" ) + { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else + rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else 
+ // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; +#endif + } + else if( arg == "--common" ) + { + rndgen = RandomNumberMode::CommonRandom; + } + else if( arg == "--rmbdev" ) + { +#ifdef MGONGPUCPP_GPUIMPL + rmbsmp = RamboSamplingMode::RamboDevice; +#else + throw std::runtime_error( "RamboDevice is not supported on CPUs" ); +#endif + } + else if( arg == "--rmbhst" ) + { + rmbsmp = RamboSamplingMode::RamboHost; + } + else if( arg == "--bridge" ) + { + bridge = true; + } + else if( is_number( argv[argn] ) && nnum < 5 ) + { + numvec[nnum++] = strtoul( argv[argn], NULL, 0 ); + } + else + { + return usage( argv[0] ); + } + } + + if( nnum == 3 || nnum == 5 ) + { + gpublocks = numvec[0]; + gputhreads = numvec[1]; + niter = numvec[2]; + if( nnum == 5 ) + { + jsondate = numvec[3]; + jsonrun = numvec[4]; + } + } + else if( nnum == 1 ) + { + niter = numvec[0]; + } + else + { + return usage( argv[0] ); + } + + if( niter == 0 ) + return usage( argv[0] ); + + if( bridge && rmbsmp == RamboSamplingMode::RamboDevice ) + { + std::cout << "WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost" << std::endl; + rmbsmp = RamboSamplingMode::RamboHost; + } + + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::CurandDevice ) + { +#if not defined MGONGPU_HAS_NO_CURAND + std::cout << "WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost" << std::endl; + rndgen = RandomNumberMode::CurandHost; +#else + std::cout << "WARNING! 
RamboHost selected: cannot use CurandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout + + using mgOnGpu::ntpbMAX; + if( gputhreads > ntpbMAX ) + { + std::cout << "ERROR! #threads/block should be <= " << ntpbMAX << std::endl; + return usage( argv[0] ); + } + +#ifndef MGONGPUCPP_GPUIMPL +#ifdef _OPENMP + ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) +#endif +#endif + + const unsigned int ndim = gpublocks * gputhreads; // number of threads in one GPU grid + const unsigned int nevt = ndim; // number of events in one iteration == number of GPU threads + + if( verbose ) + std::cout << "# iterations: " << niter << std::endl; + + // *** START THE NEW TIMERS *** + mgOnGpu::TimerMap timermap; + + // === STEP 0 - INITIALISE + +#ifdef MGONGPUCPP_GPUIMPL + + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. 
+ const std::string cdinKey = "00 GpuInit"; + timermap.start( cdinKey ); + GpuRuntime GpuRuntime( debug ); +#endif + + // --- 0a. Initialise physics process + const std::string procKey = "0a ProcInit"; + timermap.start( procKey ); + + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) + CPPProcess process( verbose ); + process.initProc( "../../Cards/param_card.dat" ); + const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) + //const fptype energy = 0.100; // Ecms = 100 MeV (well below the Z peak, pure em scattering) + const int meGeVexponent = -( 2 * CPPProcess::npar - 8 ); + + // --- 0b. Allocate memory structures + const std::string alloKey = "0b MemAlloc"; + timermap.start( alloKey ); + + // Memory buffers for random numbers for momenta +#ifndef MGONGPUCPP_GPUIMPL + HostBufferRndNumMomenta hstRndmom( nevt ); +#else + PinnedHostBufferRndNumMomenta hstRndmom( nevt ); + DeviceBufferRndNumMomenta devRndmom( nevt ); +#endif + + // Memory buffers for sampling weights +#ifndef MGONGPUCPP_GPUIMPL + HostBufferWeights hstWeights( nevt ); +#else + PinnedHostBufferWeights hstWeights( nevt ); + DeviceBufferWeights devWeights( nevt ); +#endif + + // Memory buffers for momenta +#ifndef MGONGPUCPP_GPUIMPL + HostBufferMomenta hstMomenta( nevt ); +#else + PinnedHostBufferMomenta hstMomenta( nevt ); + DeviceBufferMomenta devMomenta( nevt ); +#endif + + // Memory buffers for Gs +#ifndef MGONGPUCPP_GPUIMPL + HostBufferGs hstGs( nevt ); +#else + PinnedHostBufferGs hstGs( nevt ); + DeviceBufferGs devGs( nevt ); +#endif + + // Memory buffer for channelIDs + // [AV: channelId arrays are needed to keep a simpler signature for MatrixElementKernel 
constructors] + // [but they are not used internally (fix #892) as long as check.exe uses no-multichannel (see #896)] +#ifndef MGONGPUCPP_GPUIMPL + HostBufferChannelIds hstChannelIds( nevt ); +#else + PinnedHostBufferChannelIds hstChannelIds( nevt ); + DeviceBufferChannelIds devChannelIds( nevt ); +#endif + + // Memory buffer for iflavorVec +#ifndef MGONGPUCPP_GPUIMPL + HostBufferIflavorVec hstIflavorVec( nevt ); +#else + PinnedHostBufferIflavorVec hstIflavorVec( nevt ); + DeviceBufferIflavorVec devIflavorVec( nevt ); +#endif + + // Hardcode Gs for now (eventually they should come from Fortran MadEvent) + // Hardcode channelID to 0 + //constexpr unsigned int channelId = 0; // TEMPORARY? disable multi-channel in check.exe and gcheck.exe #466 + for( unsigned int i = 0; i < nevt; ++i ) + { + constexpr fptype fixedG = 1.2177157847767195; // fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + hstGs[i] = fixedG; + //hstChannelIds[i] = channelId; // AV ChannelId arrays are not needed in check.exe (fix #892) as long as check.exe uses no-multichannel (see #896) + //if ( i > 0 ) hstGs[i] = 0; // try hardcoding G only for event 0 + //hstGs[i] = i; + hstIflavorVec[i] = 1; // Fill with 1, all equal and same flavor combination + } + + // Memory buffers for matrix elements +#ifndef MGONGPUCPP_GPUIMPL + HostBufferMatrixElements hstMatrixElements( nevt ); +#else + PinnedHostBufferMatrixElements hstMatrixElements( nevt ); + DeviceBufferMatrixElements devMatrixElements( nevt ); +#endif + + // Memory buffers for random numbers for helicity selection + // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** +#ifndef MGONGPUCPP_GPUIMPL + HostBufferRndNumHelicity hstRndHel( nevt ); +#else + PinnedHostBufferRndNumHelicity hstRndHel( nevt ); + DeviceBufferRndNumHelicity devRndHel( nevt ); +#endif + + // Memory buffers for random numbers for color selection + // *** NB #402 these buffers 
always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** +#ifndef MGONGPUCPP_GPUIMPL + HostBufferRndNumColor hstRndCol( nevt ); +#else + PinnedHostBufferRndNumColor hstRndCol( nevt ); + DeviceBufferRndNumColor devRndCol( nevt ); +#endif + + // Memory buffers for helicity selection +#ifndef MGONGPUCPP_GPUIMPL + HostBufferSelectedHelicity hstSelHel( nevt ); +#else + PinnedHostBufferSelectedHelicity hstSelHel( nevt ); + DeviceBufferSelectedHelicity devSelHel( nevt ); +#endif + + // Memory buffers for color selection +#ifndef MGONGPUCPP_GPUIMPL + HostBufferSelectedColor hstSelCol( nevt ); +#else + PinnedHostBufferSelectedColor hstSelCol( nevt ); + DeviceBufferSelectedColor devSelCol( nevt ); +#endif + + std::unique_ptr genrtimes( new double[niter] ); + std::unique_ptr rambtimes( new double[niter] ); + std::unique_ptr wavetimes( new double[niter] ); + std::unique_ptr wv3atimes( new double[niter] ); + + // --- 0c. Create curand, hiprand or common generator + const std::string cgenKey = "0c GenCreat"; + timermap.start( cgenKey ); + // Allocate the appropriate RandomNumberKernel + std::unique_ptr prnk; + if( rndgen == RandomNumberMode::CommonRandom ) + { + prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); + } + else if( rndgen == RandomNumberMode::CurandHost ) + { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::CurandDevice ) + { +#ifdef MGONGPU_HAS_NO_CURAND /* clang-format off */ + throw std::runtime_error( "INTERNAL ERROR! 
CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + const bool onDevice = true; + prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif /* clang-format on */ + } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) + + // --- 0c. 
Create rambo sampling kernel [keep this in 0c for the moment] + std::unique_ptr prsk; + if( rmbsmp == RamboSamplingMode::RamboHost ) + { + prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt ) ); + } + else + { +#ifdef MGONGPUCPP_GPUIMPL + prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); +#else + throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + + // --- 0c. Create matrix element kernel [keep this in 0c for the moment] + std::unique_ptr pmek; + if( !bridge ) + { +#ifdef MGONGPUCPP_GPUIMPL + pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devIflavorVec, devRndHel, devRndCol, devChannelIds, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads) ); +#else + pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstIflavorVec, hstRndHel, hstRndCol, hstChannelIds, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); +#endif + } + else + { +#ifdef MGONGPUCPP_GPUIMPL + pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstIflavorVec, hstRndHel, hstRndCol, hstChannelIds, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); +#else + pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstIflavorVec, hstRndHel, hstRndCol, hstChannelIds, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); +#endif + } + int nGoodHel = 0; // the number of good helicities (out of ncomb) + + // --- 0c. 
Create cross section kernel [keep this in 0c for the moment] + EventStatistics hstStats; + CrossSectionKernelHost xsk( hstWeights, hstMatrixElements, hstStats, nevt ); + + // ************************************** + // *** START MAIN LOOP ON #ITERATIONS *** + // ************************************** + + for( unsigned long int iiter = 0; iiter < niter; ++iiter ) + { + //std::cout << "Iteration #" << iiter+1 << " of " << niter << std::endl; + + // === STEP 1 OF 3 + + // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** + double genrtime = 0; + + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) + // [NB This should not be necessary using the host API: "Generation functions + // can be called multiple times on the same generator to generate successive + // blocks of results. For pseudorandom generators, multiple calls to generation + // functions will yield the same result as a single call with a large size."] + const unsigned long long seed = 20200805; + const std::string sgenKey = "1a GenSeed "; + timermap.start( sgenKey ); + prnk->seedGenerator( seed + iiter ); + genrtime += timermap.stop(); + + // --- 1b. Generate all relevant numbers to build nevt events (i.e. nevt phase space points) on the host + const std::string rngnKey = "1b GenRnGen"; + timermap.start( rngnKey ); + prnk->generateRnarray(); + //std::cout << "Got random numbers" << std::endl; + +#ifdef MGONGPUCPP_GPUIMPL + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) + { + // --- 1c. 
Copy rndmom from host to device + const std::string htodKey = "1c CpHTDrnd"; + genrtime += timermap.start( htodKey ); + copyDeviceFromHost( devRndmom, hstRndmom ); + } +#endif + + // *** STOP THE OLD-STYLE TIMER FOR RANDOM GEN *** + genrtime += timermap.stop(); + + // === STEP 2 OF 3 + // Fill in particle momenta for each of nevt events on the device + + // *** START THE OLD-STYLE TIMER FOR RAMBO *** + double rambtime = 0; + + // --- 2a. Fill in momenta of initial state particles on the device + const std::string riniKey = "2a RamboIni"; + timermap.start( riniKey ); + prsk->getMomentaInitial(); + //std::cout << "Got initial momenta" << std::endl; + + // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device + // (i.e. map random numbers to final-state particle momenta for each of nevt events) + const std::string rfinKey = "2b RamboFin"; + rambtime += timermap.start( rfinKey ); + prsk->getMomentaFinal(); + //std::cout << "Got final momenta" << std::endl; + +#ifdef MGONGPUCPP_GPUIMPL + if( rmbsmp == RamboSamplingMode::RamboDevice ) + { + // --- 2c. CopyDToH Weights + const std::string cwgtKey = "2c CpDTHwgt"; + rambtime += timermap.start( cwgtKey ); + copyHostFromDevice( hstWeights, devWeights ); + + // --- 2d. CopyDToH Momenta + const std::string cmomKey = "2d CpDTHmom"; + rambtime += timermap.start( cmomKey ); + copyHostFromDevice( hstMomenta, devMomenta ); + } + else // only if ( ! bridge ) ??? + { + // --- 2c. CopyHToD Weights + const std::string cwgtKey = "2c CpHTDwgt"; + rambtime += timermap.start( cwgtKey ); + copyDeviceFromHost( devWeights, hstWeights ); + + // --- 2d. CopyHToD Momenta + const std::string cmomKey = "2d CpHTDmom"; + rambtime += timermap.start( cmomKey ); + copyDeviceFromHost( devMomenta, hstMomenta ); + } +#endif + + // *** STOP THE OLD-STYLE TIMER FOR RAMBO *** + rambtime += timermap.stop(); + + // === STEP 3 OF 3 + // Evaluate matrix elements for all nevt events + // 0d. 
For Bridge only, transpose C2F [renamed as 0d: this is not initialisation, but I want it out of the ME timers (#371)] + // 0e. (Only on the first iteration) Get good helicities [renamed as 0e: this IS initialisation!] + // 3a. Evaluate MEs on the device (include transpose F2C for Bridge) + // 3b. Copy MEs back from device to host + + // --- 0d. TransC2F + if( bridge ) + { + const std::string tc2fKey = "0d TransC2F"; + timermap.start( tc2fKey ); + dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); + } + +#ifdef MGONGPUCPP_GPUIMPL + // --- 2d. CopyHToD Momenta + const std::string gKey = "0.. CpHTDg"; + rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! + copyDeviceFromHost( devGs, hstGs ); + copyDeviceFromHost( devIflavorVec, hstIflavorVec ); + copyDeviceFromHost( devChannelIds, hstChannelIds ); +#endif + + // --- 0e. SGoodHel + if( iiter == 0 ) + { + const std::string ghelKey = "0e SGoodHel"; + timermap.start( ghelKey ); + nGoodHel = pmek->computeGoodHelicities(); + } + + // *** START THE OLD-STYLE TIMERS FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + double wavetime = 0; // calc plus copy + double wv3atime = 0; // calc only + + // --- 3a. SigmaKin + const std::string skinKey = "3a SigmaKin"; + timermap.start( skinKey ); + constexpr bool useChannelIds = false; // TEMPORARY? disable multi-channel in check.exe and gcheck.exe #466 + pmek->computeMatrixElements( useChannelIds ); + + // *** STOP THE NEW OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + wv3atime += timermap.stop(); // calc only + wavetime += wv3atime; // calc plus copy + +#ifdef MGONGPUCPP_GPUIMPL + if( !bridge ) + { + // --- 3b. 
CopyDToH MEs + const std::string cmesKey = "3b CpDTHmes"; + timermap.start( cmesKey ); + copyHostFromDevice( hstMatrixElements, devMatrixElements ); + // *** STOP THE OLD OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + wavetime += timermap.stop(); // calc plus copy + } +#endif + + // === STEP 4 FINALISE LOOP + // --- 4@ Update event statistics + const std::string updtKey = "4@ UpdtStat"; + timermap.start( updtKey ); + xsk.updateEventStatistics(); + + // --- 4a Dump within the loop + const std::string loopKey = "4a DumpLoop"; + timermap.start( loopKey ); + genrtimes[iiter] = genrtime; + rambtimes[iiter] = rambtime; + wavetimes[iiter] = wavetime; + wv3atimes[iiter] = wv3atime; + + if( verbose ) + { + std::cout << std::string( SEP79, '*' ) << std::endl + << "Iteration #" << iiter + 1 << " of " << niter << std::endl; + if( perf ) std::cout << "Wave function time: " << wavetime << std::endl; + } + + for( unsigned int ievt = 0; ievt < nevt; ++ievt ) // Loop over all events in this iteration + { + if( verbose ) + { + // Display momenta + std::cout << "Momenta:" << std::endl; + for( int ipar = 0; ipar < CPPProcess::npar; ipar++ ) + { + // NB: 'setw' affects only the next field (of any type) + std::cout << std::scientific // fixed format: affects all floats (default precision: 6) + << std::setw( 4 ) << ipar + 1 + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 0, ipar ) + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 1, ipar ) + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 2, ipar ) + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 3, ipar ) + << std::endl + << std::defaultfloat; // default format: affects all floats + } + std::cout << std::string( SEP79, '-' ) << std::endl; + // Display matrix elements + std::cout << " Matrix element = " << 
MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ) + << " GeV^" << meGeVexponent << std::endl; + std::cout << std::string( SEP79, '-' ) << std::endl; + } + } + + if( !( verbose || debug || perf ) ) + { + std::cout << "."; + } + } + + // ************************************** + // *** END MAIN LOOP ON #ITERATIONS *** + // ************************************** + + // === STEP 8 ANALYSIS + // --- 8a Analysis: compute stats after the loop + const std::string statKey = "8a CompStat"; + timermap.start( statKey ); + + double sumgtim = 0; + //double sqsgtim = 0; + double mingtim = genrtimes[0]; + double maxgtim = genrtimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumgtim += genrtimes[iiter]; + //sqsgtim += genrtimes[iiter]*genrtimes[iiter]; + mingtim = std::min( mingtim, genrtimes[iiter] ); + maxgtim = std::max( maxgtim, genrtimes[iiter] ); + } + + double sumrtim = 0; + //double sqsrtim = 0; + double minrtim = rambtimes[0]; + double maxrtim = rambtimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumrtim += rambtimes[iiter]; + //sqsrtim += rambtimes[iiter]*rambtimes[iiter]; + minrtim = std::min( minrtim, rambtimes[iiter] ); + maxrtim = std::max( maxrtim, rambtimes[iiter] ); + } + + double sumwtim = 0; + //double sqswtim = 0; + double minwtim = wavetimes[0]; + double maxwtim = wavetimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumwtim += wavetimes[iiter]; + //sqswtim += wavetimes[iiter]*wavetimes[iiter]; + minwtim = std::min( minwtim, wavetimes[iiter] ); + maxwtim = std::max( maxwtim, wavetimes[iiter] ); + } + double meanwtim = sumwtim / niter; + //double stdwtim = std::sqrt( sqswtim / niter - meanwtim * meanwtim ); + + double sumw3atim = 0; + //double sqsw3atim = 0; + double minw3atim = wv3atimes[0]; + double maxw3atim = wv3atimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumw3atim += wv3atimes[iiter]; + //sqsw3atim += 
wv3atimes[iiter]*wv3atimes[iiter]; + minw3atim = std::min( minw3atim, wv3atimes[iiter] ); + maxw3atim = std::max( maxw3atim, wv3atimes[iiter] ); + } + double meanw3atim = sumw3atim / niter; + //double stdw3atim = std::sqrt( sqsw3atim / niter - meanw3atim * meanw3atim ); + + const unsigned int nevtALL = hstStats.nevtALL; // total number of ALL events in all iterations + if( nevtALL != niter * nevt ) + std::cout << "ERROR! nevtALL mismatch " << nevtALL << " != " << niter * nevt << std::endl; // SANITY CHECK + int nabn = hstStats.nevtABN; + int nzero = hstStats.nevtZERO; + + // === STEP 9 FINALISE + + std::string rndgentxt; + if( rndgen == RandomNumberMode::CommonRandom ) + rndgentxt = "COMMON RANDOM HOST"; + else if( rndgen == RandomNumberMode::CurandHost ) + rndgentxt = "CURAND HOST"; + else if( rndgen == RandomNumberMode::CurandDevice ) + rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; +#else + rndgentxt += " (C++ code)"; +#endif + + // Workflow description summary + std::string wrkflwtxt; + // -- CUDA or HIP or C++? +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; +#else + wrkflwtxt += "CPP:"; +#endif /* clang-format off */ + // -- DOUBLE or FLOAT? +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) +#elif defined MGONGPU_FPTYPE_DOUBLE + wrkflwtxt += "DBL+"; +#elif defined MGONGPU_FPTYPE_FLOAT + wrkflwtxt += "FLT+"; +#else + wrkflwtxt += "???+"; // no path to this statement +#endif + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX + wrkflwtxt += "CUX:"; +#elif defined MGONGPU_CUCXTYPE_THRUST + wrkflwtxt += "THX:"; +#elif defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif +#else +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX + wrkflwtxt += "STX:"; +#elif defined MGONGPU_CPPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif /* clang-format on */ +#endif + // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? + if( rndgen == RandomNumberMode::CommonRandom ) + wrkflwtxt += "COMMON+"; + else if( rndgen == RandomNumberMode::CurandHost ) + wrkflwtxt += "CURHST+"; + else if( rndgen == RandomNumberMode::CurandDevice ) + wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; + else + wrkflwtxt += "??????+"; // no path to this statement + // -- HOST or DEVICE rambo sampling? + if( rmbsmp == RamboSamplingMode::RamboHost ) + wrkflwtxt += "RMBHST+"; + else if( rmbsmp == RamboSamplingMode::RamboDevice ) + wrkflwtxt += "RMBDEV+"; + else + wrkflwtxt += "??????+"; // no path to this statement +#ifdef MGONGPUCPP_GPUIMPL + // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? + if( !bridge ) + wrkflwtxt += "MESDEV"; + else + wrkflwtxt += "BRDDEV"; +#else + if( !bridge ) + wrkflwtxt += "MESHST"; // FIXME! allow this also in CUDA (eventually with various simd levels) + else + wrkflwtxt += "BRDHST"; +#endif + // -- SIMD matrix elements? 
+#if !defined MGONGPU_CPPSIMD + wrkflwtxt += "/none"; +#elif defined __AVX512VL__ +#ifdef MGONGPU_PVW512 + wrkflwtxt += "/512z"; +#else + wrkflwtxt += "/512y"; +#endif +#elif defined __AVX2__ + wrkflwtxt += "/avx2"; +#elif defined __SSE4_2__ +#ifdef __PPC__ + wrkflwtxt += "/ppcv"; +#elif defined __ARM_NEON__ + wrkflwtxt += "/neon"; +#else + wrkflwtxt += "/sse4"; +#endif +#else + wrkflwtxt += "/????"; // no path to this statement +#endif + // -- Has cxtype_v::operator[] bracket with non-const reference? +#if defined MGONGPU_CPPSIMD +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + wrkflwtxt += "+CXVBRK"; +#else + wrkflwtxt += "+NOVBRK"; +#endif +#else + wrkflwtxt += "+NAVBRK"; // N/A +#endif + + // --- 9a Dump to screen + const std::string dumpKey = "9a DumpScrn"; + timermap.start( dumpKey ); + + if( !( verbose || debug || perf ) ) + { + std::cout << std::endl; + } + + if( perf ) + { +#ifndef MGONGPUCPP_GPUIMPL +#ifdef _OPENMP + // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) + std::string nprocall; + std::unique_ptr nprocpipe( popen( "nproc --all", "r" ), pclose ); + if( !nprocpipe ) throw std::runtime_error( "`nproc --all` failed?" 
); + std::array nprocbuf; + while( fgets( nprocbuf.data(), nprocbuf.size(), nprocpipe.get() ) != nullptr ) nprocall += nprocbuf.data(); +#endif +#endif +#ifdef MGONGPU_CPPSIMD +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + const std::string cxtref = " [cxtype_ref=YES]"; +#else + const std::string cxtref = " [cxtype_ref=NO]"; +#endif +#endif + // Dump all configuration parameters and all results + std::cout << std::string( SEP79, '*' ) << std::endl +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" +#else + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" +#endif + << " [" << process.getCompiler() << "]" +#ifdef MGONGPU_INLINE_HELAMPS + << " [inlineHel=1]" +#else + << " [inlineHel=0]" +#endif +#ifdef MGONGPU_HARDCODE_PARAM + << " [hardcodePARAM=1]" << std::endl +#else + << " [hardcodePARAM=0]" << std::endl +#endif + << "NumBlocksPerGrid = " << gpublocks << std::endl + << "NumThreadsPerBlock = " << gputhreads << std::endl + << "NumIterations = " << niter << std::endl + << std::string( SEP79, '-' ) << std::endl; + std::cout << "Workflow summary = " << wrkflwtxt << std::endl +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + << "FP precision = MIXED (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl +#elif defined MGONGPU_FPTYPE_DOUBLE + << "FP precision = DOUBLE (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl +#elif defined MGONGPU_FPTYPE_FLOAT + << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl +#endif +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX + << "Complex type = CUCOMPLEX" << std::endl +#elif defined MGONGPU_CUCXTYPE_THRUST + << "Complex type = THRUST::COMPLEX" << std::endl +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex 
type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX + << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... +#endif + << "RanNumb memory layout = AOSOA[" << neppR << "]" + << ( neppR == 1 ? " == AOS" : "" ) + << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl + << "Momenta memory layout = AOSOA[" << neppM << "]" + << ( neppM == 1 ? " == AOS" : "" ) << std::endl +#ifdef MGONGPUCPP_GPUIMPL + //<< "Wavefunction GPU memory = LOCAL" << std::endl +#else +#if !defined MGONGPU_CPPSIMD + << "Internal loops fptype_sv = SCALAR ('none': ~vector[" << neppV + << "], no SIMD)" << std::endl +#elif defined __AVX512VL__ +#ifdef MGONGPU_PVW512 + << "Internal loops fptype_sv = VECTOR[" << neppV + << "] ('512z': AVX512, 512bit)" << cxtref << std::endl +#else + << "Internal loops fptype_sv = VECTOR[" << neppV + << "] ('512y': AVX512, 256bit)" << cxtref << std::endl +#endif +#elif defined __AVX2__ + << "Internal loops fptype_sv = VECTOR[" << neppV + << "] ('avx2': AVX2, 256bit)" << cxtref << std::endl +#elif defined __SSE4_2__ + << "Internal loops fptype_sv = VECTOR[" << neppV +#ifdef __PPC__ + << "] ('sse4': PPC VSX, 128bit)" << cxtref << std::endl +#elif defined __ARM_NEON__ + << "] ('sse4': ARM NEON, 128bit)" << cxtref << std::endl +#else + << "] ('sse4': SSE4.2, 128bit)" << cxtref << std::endl +#endif +#else +#error Internal error: unknown SIMD build configuration +#endif +#endif + << "Random number generation = " << rndgentxt << std::endl +#ifndef MGONGPUCPP_GPUIMPL +#ifdef _OPENMP + << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline +#endif +#endif + //<< "MatrixElements compiler = " << process.getCompiler() << std::endl + << std::string( SEP79, '-' ) << std::endl + << "HelicityComb Good/Tot = " << nGoodHel << "/" << CPPProcess::ncomb << std::endl + << std::string( SEP79, '-' ) << std::endl + << "NumberOfEntries = " << niter << std::endl + 
<< std::scientific // fixed format: affects all floats (default precision: 6) + << "TotalTime[Rnd+Rmb+ME] (123) = ( " << sumgtim + sumrtim + sumwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[Rambo+ME] (23) = ( " << sumrtim + sumwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[RndNumGen] (1) = ( " << sumgtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[Rambo] (2) = ( " << sumrtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[MatrixElems] (3) = ( " << sumwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "MeanTimeInMatrixElems = ( " << meanwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "[Min,Max]TimeInMatrixElems = [ " << minwtim + << " , " << maxwtim << " ] sec" << std::endl + //<< "StdDevTimeInMatrixElems = ( " << stdwtim << std::string(16, ' ') << " ) sec" << std::endl + << "TotalTime[MECalcOnly] (3a) = ( " << sumw3atim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "MeanTimeInMECalcOnly = ( " << meanw3atim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "[Min,Max]TimeInMECalcOnly = [ " << minw3atim + << " , " << maxw3atim << " ] sec" << std::endl + //<< "StdDevTimeInMECalcOnly = ( " << stdw3atim << std::string(16, ' ') << " ) sec" << std::endl + << std::string( SEP79, '-' ) << std::endl + //<< "ProcessID: = " << getpid() << std::endl + //<< "NProcesses = " << process.nprocesses << std::endl // assume nprocesses == 1 (#272 and #343) + << "TotalEventsComputed = " << nevtALL << std::endl + << "EvtsPerSec[Rnd+Rmb+ME](123) = ( " << nevtALL / ( sumgtim + sumrtim + sumwtim ) + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + << "EvtsPerSec[Rmb+ME] (23) = ( " << nevtALL / ( sumrtim + sumwtim ) + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + //<< "EvtsPerSec[RndNumGen] (1) = ( " << nevtALL/sumgtim + //<< std::string(16, ' ') << " ) sec^-1" << std::endl + //<< "EvtsPerSec[Rambo] (2) = ( " << nevtALL/sumrtim + //<< 
std::string(16, ' ') << " ) sec^-1" << std::endl + << "EvtsPerSec[MatrixElems] (3) = ( " << nevtALL / sumwtim + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + << "EvtsPerSec[MECalcOnly] (3a) = ( " << nevtALL / sumw3atim + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + << std::defaultfloat; // default format: affects all floats + std::cout << std::string( SEP79, '*' ) << std::endl + << hstStats; + } + + // --- 9b Dump to json + const std::string jsonKey = "9b DumpJson"; + timermap.start( jsonKey ); + + if( json ) + { + std::string jsonFileName = std::to_string( jsondate ) + "-perf-test-run" + std::to_string( jsonrun ) + ".json"; + jsonFileName = "./perf/data/" + jsonFileName; + + //Checks if file exists + std::ifstream fileCheck; + bool fileExists = false; + fileCheck.open( jsonFileName ); + if( fileCheck ) + { + fileExists = true; + fileCheck.close(); + } + + std::ofstream jsonFile; + jsonFile.open( jsonFileName, std::ios_base::app ); + if( !fileExists ) + { + jsonFile << "[" << std::endl; + } + else + { + //deleting the last bracket and outputting a ", " + std::string temp = "truncate -s-1 " + jsonFileName; + const char* command = temp.c_str(); + if( system( command ) != 0 ) + std::cout << "WARNING! 
Command '" << temp << "' failed" << std::endl; + jsonFile << ", " << std::endl; + } + + jsonFile << "{" << std::endl + << "\"NumIterations\": " << niter << ", " << std::endl + << "\"NumThreadsPerBlock\": " << gputhreads << ", " << std::endl + << "\"NumBlocksPerGrid\": " << gpublocks << ", " << std::endl +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + << "\"FP precision\": " + << "\"MIXED (NaN/abnormal=" << nabn << ")\"," << std::endl +#elif defined MGONGPU_FPTYPE_DOUBLE + << "\"FP precision\": " + << "\"DOUBLE (NaN/abnormal=" << nabn << ")\"," << std::endl +#elif defined MGONGPU_FPTYPE_FLOAT + << "\"FP precision\": " + << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl +#endif + << "\"Complex type\": " +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX + << "\"CUCOMPLEX\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_THRUST + << "\"THRUST::COMPLEX\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX + << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... +#endif + << "\"RanNumb memory layout\": " + << "\"AOSOA[" << neppR << "]\"" + << ( neppR == 1 ? " == AOS" : "" ) << ", " << std::endl + << "\"Momenta memory layout\": " + << "\"AOSOA[" << neppM << "]\"" + << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl +#ifdef MGONGPUCPP_GPUIMPL + //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl +#endif + << "\"Random generation\": " + << "\"" << rndgentxt << "\"," << std::endl; + + double minelem = hstStats.minME; + double maxelem = hstStats.maxME; + double meanelem = hstStats.meanME(); + double stdelem = hstStats.stdME(); + + jsonFile << "\"NumberOfEntries\": " << niter << "," << std::endl + //<< std::scientific // Not sure about this + << "\"TotalTime[Rnd+Rmb+ME] (123)\": \"" + << std::to_string( sumgtim + sumrtim + sumwtim ) << " sec\"," + << std::endl + << "\"TotalTime[Rambo+ME] (23)\": \"" + << std::to_string( sumrtim + sumwtim ) << " sec\"," << std::endl + << "\"TotalTime[RndNumGen] (1)\": \"" + << std::to_string( sumgtim ) << " sec\"," << std::endl + << "\"TotalTime[Rambo] (2)\": \"" + << std::to_string( sumrtim ) << " sec\"," << std::endl + << "\"TotalTime[MatrixElems] (3)\": \"" + << std::to_string( sumwtim ) << " sec\"," << std::endl + << "\"MeanTimeInMatrixElems\": \"" + << std::to_string( meanwtim ) << " sec\"," << std::endl + << "\"MinTimeInMatrixElems\": \"" + << std::to_string( minwtim ) << " sec\"," << std::endl + << "\"MaxTimeInMatrixElems\": \"" + << std::to_string( maxwtim ) << " sec\"," << std::endl + //<< "ProcessID: = " << getpid() << std::endl + //<< "NProcesses = " << process.nprocesses << std::endl // assume nprocesses == 1 (#272 and #343) + << "\"TotalEventsComputed\": " << nevtALL << "," << std::endl + << "\"EvtsPerSec[Rnd+Rmb+ME](123)\": \"" + << std::to_string( nevtALL / ( sumgtim + sumrtim + sumwtim ) ) << " sec^-1\"," << std::endl + << "\"EvtsPerSec[Rmb+ME] (23)\": \"" + << std::to_string( nevtALL / ( sumrtim + sumwtim ) ) << " sec^-1\"," << std::endl + << "\"EvtsPerSec[MatrixElems] (3)\": \"" + << std::to_string( nevtALL / sumwtim ) << " sec^-1\"," << std::endl + << "\"EvtsPerSec[MECalcOnly] (3)\": \"" + << std::to_string( nevtALL / sumw3atim ) << " sec^-1\"," << std::endl + << 
"\"NumMatrixElems(notAbnormal)\": " << nevtALL - nabn << "," << std::endl + << std::scientific + << "\"MeanMatrixElemValue\": " + << "\"" << std::to_string( meanelem ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"StdErrMatrixElemValue\": " + << "\"" << std::to_string( stdelem / sqrt( nevtALL ) ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"StdDevMatrixElemValue\": " + << "\"" << std::to_string( stdelem ) + << " GeV^" << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"MinMatrixElemValue\": " + << "\"" << std::to_string( minelem ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"MaxMatrixElemValue\": " + << "\"" << std::to_string( maxelem ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl; + + timermap.dump( jsonFile, true ); // NB For the active json timer this dumps a partial total + + jsonFile << "}" << std::endl; + jsonFile << "]"; + jsonFile.close(); + } + + // *** STOP THE NEW TIMERS *** + timermap.stop(); + if( perf ) + { + std::cout << std::string( SEP79, '*' ) << std::endl; + timermap.dump(); + std::cout << std::string( SEP79, '*' ) << std::endl; + } + + // [NB some resources like curand generators will be deleted here when stack-allocated classes go out of scope] + //std::cout << "ALL OK" << std::endl; + return 0; +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc new file mode 100644 index 0000000000..d2b24bba27 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc @@ -0,0 +1,418 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** +%(color_matrix_lines)s + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See 
https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE 
RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ) // input: number of good helicities + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + constexpr int ihel0 = 0; // the input buffer allJamps already points to a specific helicity + cxtype jamp = J_ACCESS::kernelAccessIcolIhelNhelConst( allJamps, icol, ihel0, nGoodHel ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + fptype2 jampRi = jampR[icol]; + fptype2 jampIi = jampI[icol]; + // OLD IMPLEMENTATION (ihel3: symmetric square matrix) - Loop over all jcol + //for( int jcol = 0; jcol < ncolor; jcol++ ) + //{ + // fptype2 jampRj = jampR[jcol]; + // fptype2 jampIj = jampI[jcol]; + // ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + // ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] 
* jampIj; // use fptype2 version of color matrix + //} + // NEW IMPLEMENTATION #475 (ihel3p1: triangular lower diagonal matrix) - Loop over jcol < icol + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampRi; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + icol] * jampIi; // use fptype2 version of color matrix + for( int jcol = 0; jcol < icol; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += 2 * s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampRi; + deltaMEs += ztempI * jampIi; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const fptype* allJamps, // input: jamp[2][ncolor][ihel][nevt] for one specific helicity ihel + const int nhel ) // input: number of good helicities nGoodHel + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + constexpr int ihel = 0; // the input buffer allJamps already points to a specific helicity + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] = + allJamps[ix2 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* ghelAllMEs, // output: allMEs super-buffer[nhel][nevt], add |M|^2 separately for each helicity + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nhel][nevt] for nhel good helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nhel good helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#else + gpuStream_t* /*ghelStreams*/, // input: cuda streams (index is ighel: only the first nhel <= ncomb are non-null) +#endif + const int nhel, // input: number of good helicities (nhel == nGoodHel) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * 
gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[2*ncolor*nhel*nevt] buffers and one fptype2[nhel*nevt] buffers for the nhel helicities + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of first fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllJampsFpt2 = ghelAllBlasTmp + ncolor * mgOnGpu::nx2 * nhel * nevt; // start of second fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nhel * nevt; // start of fptype2[nhel*nevt] buffer + // Convert jamps from double to float + for( int ighel = 0; ighel < nhel; ighel++ ) + { + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // jamps for a single helicity ihel + fptype2* hAllJampsFpt2 = ghelAllJampsFpt2 + ighel * nevt; // jamps for a single helicity ihel + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, ghelStreams[ighel], hAllJampsFpt2, hAllJamps, nhel ); + } + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJampsFpt2; + const fptype2* ghelAllJampsImag = ghelAllJampsFpt2 + ncolor * nhel * nevt; +#else + // Standard single or double precision mode: need one fptype2[ncolor*2*nhel*nevt] buffer + static_assert( std::is_same::value ); + fptype2* ghelAllZtempBoth = ghelAllBlasTmp; // start of fptype2[ncolor*2*nhel*nevt] buffer + fptype2* ghelAllMEsFpt2 = ghelAllMEs; + // Real and imaginary components + const fptype2* ghelAllJampsReal = ghelAllJamps; // this is not a cast (the two types are identical) + const fptype2* ghelAllJampsImag = ghelAllJamps + ncolor * nhel * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* ghelAllZtempReal = ghelAllZtempBoth; + 
fptype2* ghelAllZtempImag = ghelAllZtempBoth + ncolor * nhel * nevt; + + // Note: striding for cuBLAS from DeviceAccessJamp: + // - ghelAllJamps(icol,ihel,ievt).real is ghelAllJamps[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + // - ghelAllJamps(icol,ihel,ievt).imag is ghelAllJamps[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] + + // Step 1: Compute Ztemp[ncolor][nhel*nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nhel*nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nhel*nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsReal, nevtN, // JampsV is nevtN x ncolorK + &beta1, + ghelAllZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + ghelAllJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + ghelAllZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + 
&alpha2, + ghelAllJampsReal, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column + ghelAllZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevtN (nhel*nevt) "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + ghelAllJampsImag, nevtN, 1, // allJamps is nevtN x ncolor, stride 1 for each ievt column (new1) + ghelAllZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevtN, with stride ncolor for each ievt column + &beta2, + ghelAllMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevtN ) ); // there are nevt (nhel*nevt) "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + for( int ighel = 0; ighel < nhel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for a single helicity ihel + fptype2* hAllMEsFpt2 = ghelAllMEsFpt2 + ighel * nevt; // MEs for a single helicity ihel + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, ghelStreams[ighel], hAllMEs, hAllMEsFpt2 ); + } +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda 
streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + // CASE 1: KERNEL + if( !pBlasHandle ) + { + assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + // Loop over helicities + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; // MEs for one specific helicity ighel + const fptype* hAllJamps = ghelAllJamps + ighel * nevt; // Jamps for one specific helicity ighel + gpuStream_t hStream = ghelStreams[ighel]; + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, hStream, hAllMEs, hAllJamps, nGoodHel ); + } + } + // CASE 2: BLAS + else + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas +#else + checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed + // Reset the tmp buffer +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); +#else + gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) ); +#endif + // Delegate the color sum to BLAS for + color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads ); +#endif + } + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h new file mode 100644 index 0000000000..9e942d3edc --- /dev/null +++ 
b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcolIhelNhel( fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype_ref( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + static __device__ inline 
const cxtype + kernelAccessIcolIhelNhelConst( const fptype* buffer, const int icol, const int ihel, const int nhel ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // (ONE HELICITY) Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // (ONE HELICITY) New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding was used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + //return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // (ALL HELICITIES) New striding for cuBLAS: two separate ncolor*nhel*nevt matrices for each of real and imag (ievt last) + return cxtype( buffer[0 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt], + buffer[1 * ncolor * nhel * nevt + icol * nhel * nevt + ihel * nevt + ievt] ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* ghelAllMEs, // output: allMEs 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + const fptype* ghelAllJamps, // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int nGoodHel, // input: number of good helicities + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for one specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + const int nGoodHel ); // input: number of good helicities +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/coloramps.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/coloramps.h new file mode 100644 index 0000000000..342fc698c2 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/coloramps.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef COLORAMPS_H +#define COLORAMPS_H 1 + +#include "CPPProcess.h" + +// Note: strictly speaking the check '#ifdef MGONGPU_SUPPORTS_MULTICHANNEL' is not needed here, +// because coloramps.h is not included otherwise, but adding it does not harm and makes the code clearer + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ + +namespace mgOnGpu +{ + // Summary of numbering and indexing conventions for the relevant concepts (see issue #826 and PR #852) + // - Diagram number (no variable) in [1, N_diagrams]: all values are allowed (N_diagrams distinct values) + // => this number is displayed for information before each block of code in CPPProcess.cc + // - Channel number ("channelId" in C, CHANNEL_ID in F) in [1, N_channels]: not all values are allowed (N_config <= N_channels <= N_diagrams distinct values) + // *** NB channelId is a diagram number: but ALL diagrams > N_channels, and also some < N_channels, do not have an associated SDE config number (#919) *** + // => this number (with F indexing as in ps/pdf output) is passed around as an API argument between cudacpp functions + // Note: the old API passes around a single CHANNEL_ID (and uses CHANNEL_ID=0 to indicate no-multichannel mode, but this is not used in coloramps.h), + // while the new API passes around an array of CHANNEL_ID's (and uses a NULL array pointer to indicate no-multichannel mode) + // - Channel number in C indexing: "channelID - 1" + // => this number (with C indexing) is used as the index of the channel2iconfig array below + // - Config number ("iconfig" in C, ICONFIG in F) in [1, N_config]: all values are allowed (N_config <= N_channels <= N_diagrams distinct values) + // - Config number in C indexing: "iconfig - 1" + // => this number (with C indexing) is used as the index of the icolamp array below + + // The number of channels in the channel2iconfig array below + // *** NB this is not guaranteed to be equal to ndiagrams, it can be lower as the remaining diagrams all have no 
associated SDE iconfig (#919) *** + constexpr unsigned int nchannels = %(nb_diag)i; +#ifdef MGONGPUCPP_GPUIMPL + static_assert( nchannels <= mg5amcGpu::CPPProcess::ndiagrams, "nchannels should be <= ndiagrams" ); // sanity check #910 and #919 +#else + static_assert( nchannels <= mg5amcCpu::CPPProcess::ndiagrams, "nchannels should be <= ndiagrams" ); // sanity check #910 and #919 +#endif + + // Map channel to iconfig (e.g. "iconfig = channel2iconfig[channelId - 1]": input index uses C indexing, output index uses F indexing) + // Note: iconfig=-1 indicates channels/diagrams with no associated iconfig for single-diagram enhancement in the MadEvent sampling algorithm (presence of 4-point interaction?) + // This array has N_diagrams elements, but only N_config <= N_diagrams valid values (iconfig>0) + // (NB: this array is created on the host in C++ code and on the device in GPU code, but a host copy is also needed in runTest #917) + __device__ constexpr int channel2iconfig[%(nb_diag)i] = { // note: a trailing comma in the initializer list is allowed +%(channelc2iconfig_lines)s + }; + + // Host copy of the channel2iconfig array (this is needed in runTest #917) +#ifndef MGONGPUCPP_GPUIMPL + constexpr const int* hostChannel2iconfig = channel2iconfig; +#else + constexpr int hostChannel2iconfig[%(nb_diag)i] = { // note: a trailing comma in the initializer list is allowed +%(channelc2iconfig_lines)s + }; +#endif + + // The number N_config of channels/diagrams with an associated iconfig for single-diagram enhancement in the MadEvent sampling algorithm (#917) + constexpr unsigned int nconfigSDE = %(nb_channel)s; + + // Map iconfig to the mask of allowed colors (e.g. 
"colormask = icolamp[iconfig - 1]": input index uses C indexing) + // This array has N_config <= N_diagrams elements + // (NB: this array is created on the host in C++ code and on the device in GPU code) + __device__ constexpr bool icolamp[%(nb_channel)s][%(nb_color)s] = { // note: a trailing comma in the initializer list is allowed +%(is_LC)s + }; + +} +#endif /* clang-format on */ + +#endif // COLORAMPS_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h new file mode 100644 index 0000000000..389f8036ed --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h @@ -0,0 +1,334 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include +#include + +// SWITCH ON/OFF DEBUGGING +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug + +// FOR DEBUGGING +#ifdef CONSTEXPR_MATH_DEBUG +#define CONSTEXPRMATHFUN inline +#define CONSTEXPRMATHVAR const +#else +#define CONSTEXPRMATHFUN constexpr +#define CONSTEXPRMATHVAR constexpr +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? 
curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // SQRT constants + constexpr long double constexpr_sqrt2 = constexpr_sqrt( 2 ); + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp, const bool requireExpGe0 = false ) + { + // NB(1): this iterative integer implementation of constexpr_pow requires exponent >= 0 + if( requireExpGe0 ) assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + if( exp < 0 ) return 1. / constexpr_pow( base, -exp, true ); + // NB(2): this iterative integer implementation of constexpr_pow requires an integer exponent, excexpt for special cases (1/2, 1/4) + if( exp == 0.5 ) return constexpr_sqrt( base ); + if( exp == 0.25 ) return constexpr_sqrt( constexpr_sqrt( base ) ); + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI constants + // NB1: M_PIl from from cmath is not defined on Mac + // NB2: std::numbers::pi needs c++20 but we are still using c++17 + // NB3: I could use my constexpr_atan(1)*4... but a literal is better? 
+  //constexpr long double constexpr_pi = M_PIl; // pi
+  //constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2
+  //constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4
+  constexpr long double constexpr_pi = 3.141592653589793238462643383279502884L;      // same as M_PIl in gcc
+  constexpr long double constexpr_pi_by_2 = 1.570796326794896619231321691639751442L; // same as M_PI_2l in gcc
+  constexpr long double constexpr_pi_by_4 = 0.785398163397448309615660845819875721L; // same as M_PI_4l in gcc
+  static_assert( constexpr_pi_by_4 * 4 == constexpr_pi );
+  static_assert( constexpr_pi_by_4 * 2 == constexpr_pi_by_2 );
+  static_assert( constexpr_pi_by_2 * 2 == constexpr_pi );
+
+  // Constexpr implementation of sin for 0<x<pi/4 (long double signature)
+  CONSTEXPRMATHFUN long double sinTaylor( const long double xx )
+  {
+#ifdef CONSTEXPR_MATH_DEBUG
+    if( xx < 0 || xx >= constexpr_pi_by_4 ) std::cout << "sinTaylor xx=" << xx << std::endl;
+#endif
+    assert( xx >= 0 && "The argument of sinTaylor is lower than the expected range [0,pi/4)" );
+    assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is higher than the expected range [0,pi/4)" );
+    long double sinx = 0;
+    int ipow = 1;
+    long double delta = xx;
+    while( true )
+    {
+      long double sinxlast = sinx;
+      sinx += delta;
+#ifdef CONSTEXPR_MATH_DEBUG
+      //std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr)
+#endif
+      if( sinx == sinxlast ) break;
+      // Next iteration
+      ipow += 2;
+      delta *= -xx * xx / ( ipow - 1 ) / ipow;
+    }
+#ifdef CONSTEXPR_MATH_DEBUG
+    //std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr)
+#endif
+    return sinx;
+  }
+
+#ifdef CONSTEXPR_MATH_DEBUG
+  // Debug printouts for trig functions
+  inline void debug_constexpr_trig( const long double xx, size_t call )
+  {
+    CONSTEXPRMATHVAR long double xxminuspi = xx - constexpr_pi;
+    CONSTEXPRMATHVAR long double twopiminusxx = 2 * constexpr_pi - xx;
+    std::cout << std::setprecision( 40 );
+    std::cout << "constexpr_sin_quad call=" << call << std::endl
+              << " xx=" << xx
<< std::endl + << " (xx-pi)=" << xxminuspi << std::endl + << " (2pi-xx)=" << twopiminusxx << std::endl; + std::cout << std::setprecision( 6 ); + if( xx < 0 ) // (-inf, 0) + std::cout << " -- case 1 (xx < 0)" << std::endl; + else if( xx == 0 ) // [0] *** NEW + std::cout << " -- case 2 (xx == 0)" << std::endl; + else if( xx < constexpr_pi_by_4 ) // (0, 1/4*pi) + std::cout << " -- case 3 (xx < pi/4)" << std::endl; + else if( xx == constexpr_pi_by_4 ) // [1/4*pi] *** NEW (3rd fix #903 assert fails) + std::cout << " -- case 4 (xx == pi/4)" << std::endl; + else if( xx < constexpr_pi_by_2 ) // (1/4*pi, 1/2*pi) + std::cout << " -- case 5 (xx < pi/2)" << std::endl; + else if( xx == constexpr_pi_by_2 ) // [1/2*pi] ** NEW + std::cout << " -- case 6 (xx == pi/2)" << std::endl; + else if( xx < 3 * constexpr_pi_by_4 ) // (1/2*pi, 3/4*pi) + std::cout << " -- case 7 (xx < 3*pi/4)" << std::endl; + else if( xx == 3 * constexpr_pi_by_4 ) // [3/4*pi] ** NEW + std::cout << " -- case 8 (xx == 3*pi/4)" << std::endl; + else if( xx < constexpr_pi ) // (3/4*pi, pi) + std::cout << " -- case 9 (xx < pi)" << std::endl; + else if( xx == constexpr_pi ) // [pi] *** NEW (2nd fix #903 infinite recursion) + std::cout << " -- case 10 (xx == pi)" << std::endl; + else if( xx < 2 * constexpr_pi ) // (pi, 2*pi) + std::cout << " -- case 11 (xx < 2*pi)" << std::endl; + else if( xx == 2 * constexpr_pi ) // [2*pi] ** NEW + std::cout << " -- case 12 (xx == 2*pi)" << std::endl; + else // (2*pi, +inf) + std::cout << " -- case 13 (xx > 2*pi)" << std::endl; + } +#endif + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + CONSTEXPRMATHFUN long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } +#ifdef CONSTEXPR_MATH_DEBUG + static size_t call = 0; + if( !assume0to2Pi ) + call = 0; + else + call++; + if( call > 10 ) debug_constexpr_trig( xx, call ); + assert( call < 20 ); +#endif + if( xx < 0 ) // (-inf, 0) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx == 0 ) // [0] *** NEW + return 1; + else if( xx < constexpr_pi_by_4 ) // (0, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx == constexpr_pi_by_4 ) // [1/4*pi] *** NEW (3rd fix #903 assert fails) + return 1 / constexpr_sqrt2; + else if( xx < constexpr_pi_by_2 ) // (1/4*pi, 1/2*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx == constexpr_pi_by_2 ) // [1/2*pi] ** NEW + return 0; + else if( xx < 3 * constexpr_pi_by_4 ) // (1/2*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx == 3 * constexpr_pi_by_4 ) // [3/4*pi] ** NEW + return -1 / constexpr_sqrt2; + else if( xx < constexpr_pi ) // (3/4*pi, pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx == constexpr_pi ) // [pi] *** NEW (2nd fix #903 infinite recursion) + return -1; + else if( xx < 2 * constexpr_pi ) // (pi, 2*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else if( xx == 2 * constexpr_pi ) // [2*pi] ** NEW + return 1; + else // (2*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + CONSTEXPRMATHFUN double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + CONSTEXPRMATHFUN long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } +#ifdef CONSTEXPR_MATH_DEBUG + static size_t call = 0; + if( !assume0to2Pi ) + call = 0; + else + call++; + if( call > 10 ) debug_constexpr_trig( xx, call ); + assert( call < 20 ); +#endif + if( xx < 0 ) // (-inf, 0) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx == 0 ) // [0] *** NEW + return 0; + else if( xx < constexpr_pi_by_4 ) // (0, 1/4*pi) + return sinTaylor( xx ); + else if( xx == constexpr_pi_by_4 ) // [1/4*pi] *** NEW (3rd fix #903 assert fails) + return 1 / constexpr_sqrt2; + else if( xx < constexpr_pi_by_2 ) // (1/4*pi, 1/2*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx == constexpr_pi_by_2 ) // [1/2*pi] ** NEW + return 1; + else if( xx < 3 * constexpr_pi_by_4 ) // (1/2*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx == 3 * constexpr_pi_by_4 ) // [3/4*pi] ** NEW + return 1 / constexpr_sqrt2; + else if( xx < constexpr_pi ) // (3/4*pi, pi) + return sinTaylor( constexpr_pi - xx ); + else if( xx == constexpr_pi ) // [pi] *** NEW (1st fix #903 infinite recursion) + return 0; + else if( xx < 2 * constexpr_pi ) // (pi, 2*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else if( xx == 2 * constexpr_pi ) // [2*pi] ** NEW + return 0; + else // (2*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + CONSTEXPRMATHFUN double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + CONSTEXPRMATHFUN long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_tan_quad is assumed to be in [0,2*pi)" );
+      assert( xx < 2 * constexpr_pi && "The argument of constexpr_tan_quad is assumed to be in [0,2*pi)" );
+    }
+    if( xx < 0 )
+      return constexpr_tan_quad( mapIn0to2Pi( xx ), true );
+    else if( xx < 2 * constexpr_pi ) // [0, 2*pi)
+      return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi );
+    else // [2*pi, +inf)
+      return constexpr_tan_quad( mapIn0to2Pi( xx ), true );
+  }
+
+  // Constexpr implementation of tan (double signature, internally implemented as long double)
+  CONSTEXPRMATHFUN double constexpr_tan( const double x )
+  {
+    return constexpr_tan_quad( x );
+  }
+
+  // Constexpr implementation of atan for -1<x<1 (long double signature)
+  CONSTEXPRMATHFUN long double atanTaylor( const long double xx )
+  {
+    assert( xx >= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" );
+    assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" );
+    long double atanx = 0;
+    int ipow = 1;
+    long double xpow = xx;
+    while( true )
+    {
+      long double atanxlast = atanx;
+      atanx += xpow / ipow;
+#ifdef CONSTEXPR_MATH_DEBUG
+      //std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr)
+#endif
+      if( atanx == atanxlast ) break;
+      // Next iteration
+      ipow += 2;
+      xpow *= -xx * xx;
+    }
+    return atanx;
+  }
+
+  // Constexpr implementation of atan (long double signature)
+  CONSTEXPRMATHFUN long double constexpr_atan_quad( const long double xx )
+  {
+    if( xx > 1 )
+      return constexpr_pi_by_2 - atanTaylor( 1 / xx );
+    else if( xx == 1 )
+      return constexpr_pi_by_4;
+    else if( xx > -1 )
+      return atanTaylor( xx );
+    else if( xx == -1 )
+      return -constexpr_pi_by_4;
+    else // if( xx < -1 )
+      return -constexpr_pi_by_2 - atanTaylor( 1 / xx );
+  }
+
+  // Constexpr implementation of atan (double signature, internally implemented as long double)
+  CONSTEXPRMATHFUN double constexpr_atan( const double x )
+  {
+    return constexpr_atan_quad( x );
+  }
+}
+
+#endif // constexpr_math_h
diff --git
a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc new file mode 100644 index 0000000000..8ef58cce80 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc @@ -0,0 +1,93 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#include "timer.h" +#define TIMERTYPE std::chrono::high_resolution_clock + +#include +#include + +// NB1: The C functions counters_xxx_ in this file are called by Fortran code +// Hence the trailing "_": 'call counters_end()' links to counters_end_ +// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html + +// NB2: This file also contains C++ code and is built using g++ +// Hence use 'extern "C"' to avoid name mangling by the C++ compiler +// See https://www.geeksforgeeks.org/extern-c-in-c + +extern "C" +{ + // Now: fortran=-1, cudacpp=0 + // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc... 
+ constexpr unsigned int nimplC = 3; + constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; } + const char* iimplC2TXT( int iimplC ) + { + const int iimplF = iimplC - 1; + switch( iimplF ) + { + case -1: return "Fortran MEs"; break; + case +0: return "CudaCpp MEs"; break; + case +1: return "CudaCpp HEL"; break; + default: assert( false ); break; + } + } + + static mgOnGpu::Timer program_timer; + static float program_totaltime = 0; + static mgOnGpu::Timer smatrix1multi_timer[nimplC]; + static float smatrix1multi_totaltime[nimplC] = { 0 }; + static int smatrix1multi_counter[nimplC] = { 0 }; + + void counters_initialise_() + { + program_timer.Start(); + return; + } + + void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt ) + { + const unsigned int iimplC = iimplF2C( *iimplF ); + smatrix1multi_counter[iimplC] += *pnevt; + smatrix1multi_timer[iimplC].Start(); + return; + } + + void counters_smatrix1multi_stop_( const int* iimplF ) + { + const unsigned int iimplC = iimplF2C( *iimplF ); + smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration(); + return; + } + + void counters_finalise_() + { + program_totaltime += program_timer.GetDuration(); + // Write to stdout + float overhead_totaltime = program_totaltime; + for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC]; + printf( " [COUNTERS] PROGRAM TOTAL : %9.4fs\n", program_totaltime ); + printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime ); + for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) + { + if( smatrix1multi_counter[iimplC] > 0 ) + { + if( iimplC < nimplC - 1 ) // MEs + printf( " [COUNTERS] %11s ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n", + iimplC2TXT( iimplC ), + iimplC + 1, + smatrix1multi_totaltime[iimplC], + smatrix1multi_counter[iimplC], + smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] ); + else + printf( " [COUNTERS] %11s ( 
%1d ) : %9.4fs\n", + iimplC2TXT( iimplC ), + iimplC + 1, + smatrix1multi_totaltime[iimplC] ); + } + } + return; + } +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc new file mode 100644 index 0000000000..cace65e4b8 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc @@ -0,0 +1,13 @@ +! Copyright (C) 2010 The ALOHA Development team and Contributors. +! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. +!========================================================================== +! Copyright (C) 2020-2024 CERN and UCLouvain. +! Licensed under the GNU Lesser General Public License (version 3 or later). +! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +! Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +!========================================================================== +%(function_definitions)s +} // end namespace + +#endif // HelAmps_%(model_name)s_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc new file mode 100644 index 0000000000..6f47e00573 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -0,0 +1,35 @@ +// Copyright (C) 2010 The ALOHA Development team and Contributors. +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Sep 2010) for the MG5aMC backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +//========================================================================== +// This file has been automatically generated for %(output_name)s by +%(info_lines)s +//========================================================================== + +#ifndef HelAmps_%(model_name)s_H +#define HelAmps_%(model_name)s_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "Parameters.h" + +#include +//#include +//#include +//#include +//#include + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +%(function_declarations)s diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk new file mode 100644 index 0000000000..9f1494512c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -0,0 +1,1242 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. + +#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') + +#=== NB: different names (e.g. 
cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk + +#------------------------------------------------------------------------------- + +#=== Include cudacpp_config.mk + +# Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported (and configure defaults if no user-defined choices exist) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing. +# Determine CUDACPP_BUILDDIR from a DIRTAG based on BACKEND, FPTYPE, HELINL, HRDCOD and from the user-defined choice of USEBUILDDIR +include ../../src/cudacpp_config.mk + +# Export CUDACPP_BUILDDIR (so that there is no need to check/define it again in cudacpp_src.mk) +export CUDACPP_BUILDDIR + +#------------------------------------------------------------------------------- + +#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) + +SHELL := /bin/bash + +#------------------------------------------------------------------------------- + +#=== Detect O/S and architecture (assuming uname is available, https://en.wikipedia.org/wiki/Uname) + +# Detect O/S kernel (Linux, Darwin...) +UNAME_S := $(shell uname -s) +###$(info UNAME_S='$(UNAME_S)') + +# Detect architecture (x86_64, ppc64le...) 
+UNAME_P := $(shell uname -p) +###$(info UNAME_P='$(UNAME_P)') + +#------------------------------------------------------------------------------- + +#=== Include the common MG5aMC Makefile options + +# OM: including make_opts is crucial for MG5aMC flag consistency/documentation +# AV: disable the inclusion of make_opts if the file has not been generated (standalone cudacpp) +ifneq ($(wildcard ../../Source/make_opts),) + include ../../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Redefine BACKEND if the current value is 'cppauto' + +# Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available) +ifeq ($(BACKEND),cppauto) + ifeq ($(UNAME_P),ppc64le) + override BACKEND = cppsse4 + else ifeq ($(UNAME_P),arm) + override BACKEND = cppsse4 + else ifeq ($(wildcard /proc/cpuinfo),) + override BACKEND = cppnone + ###$(warning Using BACKEND='$(BACKEND)' because host SIMD features cannot be read from /proc/cpuinfo) + else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1) + override BACKEND = cpp512y + else ifeq ($(shell grep -m1 -c avx2 /proc/cpuinfo),1) + override BACKEND = cppavx2 + ###ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1) + ### $(warning Using BACKEND='$(BACKEND)' because host does not support avx512vl) + ###else + ### $(warning Using BACKEND='$(BACKEND)' because this is faster than avx512vl for clang) + ###endif + else ifeq ($(shell grep -m1 -c sse4_2 /proc/cpuinfo),1) + override BACKEND = cppsse4 + else + override BACKEND = cppnone + endif + $(info BACKEND=$(BACKEND) (was cppauto)) +else + $(info BACKEND='$(BACKEND)') +endif + +#------------------------------------------------------------------------------- + +#=== Configure the C++ compiler + +CXXFLAGS = $(OPTFLAGS) -std=c++17 -Wall -Wshadow -Wextra +ifeq ($(shell $(CXX) --version | grep ^nvc++),) + CXXFLAGS += -ffast-math # see issue #117 +endif 
+###CXXFLAGS+= -Ofast # performance is not different from --fast-math +###CXXFLAGS+= -g # FOR DEBUGGING ONLY + +# Optionally add debug flags to display the full list of flags (eg on Darwin) +###CXXFLAGS+= -v + +# Note: AR, CXX and FC are implicitly defined if not set externally +# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html + +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -mmacosx-version-min=11.3 +endif + +# Export CXXFLAGS (so that there is no need to check/define it again in cudacpp_src.mk) +export CXXFLAGS + +#------------------------------------------------------------------------------- + +#=== Configure the GPU compiler (CUDA or HIP) +#=== (note, this is done also for C++, as NVTX and CURAND/ROCRAND are also needed by the C++ backends) + +# Set CUDA_HOME from the path to nvcc, if it exists +override CUDA_HOME = $(patsubst %%/bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) + +# Set HIP_HOME from the path to hipcc, if it exists +override HIP_HOME = $(shell hipconfig --rocmpath) + +# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) +ifeq ($(CUDA_HOME),) + # CUDA_HOME is empty (nvcc not found) + override CUDA_INC= +else ifeq ($(wildcard $(CUDA_HOME)/include/),) + # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist? + override CUDA_INC= +else + CUDA_INC = -I$(CUDA_HOME)/include/ +endif +###$(info CUDA_INC=$(CUDA_INC)) + +# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965) +ifeq ($(CUDA_INC),) + # $(CUDA_HOME)/include/ does not exist + override USE_NVTX= +else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),) + # $(CUDA_HOME)/include/ exists but NVTX headers do not exist? 
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
+endif
+###$(info USE_NVTX=$(USE_NVTX))
+
+# NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
+# - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
+#   If both CUDA and HIP were installed, then CUDA took precedence over HIP, and the only way to force HIP builds was to disable
+#   CUDA builds by setting CUDA_HOME to an invalid value (as CUDA_HOME took precedence over PATH to find the installation of nvcc).
+#   Similarly, C++-only builds could be forced by setting CUDA_HOME and/or HIP_HOME to invalid values. A check for an invalid nvcc
+#   in CUDA_HOME or an invalid hipcc HIP_HOME was necessary to ensure this logic, and had to be performed at the very beginning.
+# - In the new implementation (PR #798), separate individual builds are performed for one specific C++/AVX mode, for CUDA or
+#   for HIP. The choice of the type of build is taken depending on the value of the BACKEND variable (replacing the AVX variable).
+#   Unlike what happened in the past, nvcc and hipcc must have already been added to PATH. Using 'which nvcc' and 'which hipcc',
+#   their existence and their location is checked, and the variables CUDA_HOME and HIP_HOME are internally set by this makefile.
+#   This must be still done before backend-specific customizations, e.g. because CURAND and NVTX are also used in C++ builds.
+#   Note also that a preliminary check for nvcc and hipcc if BACKEND is cuda or hip is performed in cudacpp_config.mk.
+# - Note also that the REQUIRE_CUDA variable (which was used in the past, e.g. for CI tests on GPU #443) is now (PR #798) no
+#   longer necessary, as it is now equivalent to BACKEND=cuda. Similarly, there is no need to introduce a REQUIRE_HIP variable.
+ +#=== Configure the CUDA or HIP compiler (only for the CUDA and HIP backends) +#=== (NB: throughout all makefiles, an empty GPUCC is used to indicate that this is a C++ build, i.e. that BACKEND is neither cuda nor hip!) + +ifeq ($(BACKEND),cuda) + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(error BACKEND=$(BACKEND) but CUDA builds are not supported for multi-word CXX "$(CXX)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc (it was already checked above that this exists) + GPUCC = $(CUDA_HOME)/bin/nvcc + XCOMPILERFLAG = -Xcompiler + GPULANGUAGE = cu + GPUSUFFIX = cuda + + # Optimization flags + GPUFLAGS = $(foreach opt, $(OPTFLAGS), $(XCOMPILERFLAG) $(opt)) + + # NVidia CUDA architecture flags + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # This will embed device code for 70, and PTX for 70+. + # One may pass MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to the make command to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###GPUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###GPUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + GPUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + GPUFLAGS += $(GPUARCHFLAGS) + + # Other NVidia-specific flags + CUDA_OPTFLAGS = -lineinfo + GPUFLAGS += $(CUDA_OPTFLAGS) + + # NVCC version + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + + # Fast math + GPUFLAGS += -use_fast_math + + # Extra build warnings + GPUFLAGS += $(XCOMPILERFLAG) -Wunused-parameter + ###GPUFLAGS += $(XCOMPILERFLAG) -Wall $(XCOMPILERFLAG) -Wextra $(XCOMPILERFLAG) -Wshadow + + # CUDA includes and NVTX + GPUFLAGS += $(CUDA_INC) $(USE_NVTX) + + # C++ standard + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + + # For nvcc, use -maxrregcount to control the maximum number of registries (this does not exist in hipcc) + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 
(65536 128 12) + + # Set the host C++ compiler for nvcc via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + +else ifeq ($(BACKEND),hip) + + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 + # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) + GPUCC = $(HIP_HOME)/bin/hipcc + XCOMPILERFLAG = + GPULANGUAGE = hip + GPUSUFFIX = hip + + # Optimization flags + override OPTFLAGS = -O2 # work around "Memory access fault" in gq_ttq for HIP #806: disable hipcc -O3 optimizations + GPUFLAGS = $(foreach opt, $(OPTFLAGS), $(XCOMPILERFLAG) $(opt)) + + # DEBUG FLAGS (for #806: see https://hackmd.io/@gmarkoma/lumi_finland) + ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY + + # AMD HIP architecture flags + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} + GPUFLAGS += $(GPUARCHFLAGS) + + # Other AMD-specific flags + GPUFLAGS += -target x86_64-linux-gnu -DHIP_PLATFORM=amd + + # Fast math (is -DHIP_FAST_MATH equivalent to -ffast-math?) + GPUFLAGS += -DHIP_FAST_MATH + + # Extra build warnings + ###GPUFLAGS += $(XCOMPILERFLAG) -Wall $(XCOMPILERFLAG) -Wextra $(XCOMPILERFLAG) -Wshadow + + # HIP includes + HIP_INC = -I$(HIP_HOME)/include/ + GPUFLAGS += $(HIP_INC) + + # C++ standard + GPUFLAGS += -std=c++17 + +else + + # Backend is neither cuda nor hip + override GPUCC= + override GPUFLAGS= + + # Sanity check, this should never happen: if GPUCC is empty, then this is a C++ build, i.e. BACKEND is neither cuda nor hip. + # In practice, in the following, "ifeq ($(GPUCC),)" is equivalent to "ifneq ($(findstring cpp,$(BACKEND)),)". 
+ # Conversely, note that GPUFLAGS is non-empty also for C++ builds, but it is never used in that case. + ifeq ($(findstring cpp,$(BACKEND)),) + $(error INTERNAL ERROR! Unknown backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) + endif + +endif + +# Export GPUCC, GPUFLAGS, GPULANGUAGE, GPUSUFFIX (so that there is no need to check/define them again in cudacpp_src.mk) +export GPUCC +export GPUFLAGS +export GPULANGUAGE +export GPUSUFFIX + +#------------------------------------------------------------------------------- + +#=== Configure ccache for C++ and CUDA/HIP builds + +# Enable ccache only if USECCACHE=1 +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX:=ccache $(CXX) +endif +#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) +# override AR:=ccache $(AR) +#endif +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) + endif +endif + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+
+# Dependency on src directory
+# (the common library name carries a backend suffix so that C++-only and GPU builds can coexist)
+ifeq ($(GPUCC),)
+MG5AMC_COMMONLIB = mg5amc_common_cpp
+else
+MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX)
+endif
+LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+INCFLAGS += -I../../src
+
+# Compiler-specific googletest build directory (#125 and #738)
+# (CXXNAME encodes compiler family and version, parsed from '$(CXX) --version' output)
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+  override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
+  override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+  override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+  override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+
+# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk)
+export CXXNAMESUFFIX
+
+# Dependency on test directory
+# Within the madgraph4gpu git repo: by default use a common gtest installation in <root>/test (optionally use an external or local gtest)
+# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
+# Do not build the tests if USEGTEST is equal to 0 (default inside launch_plugin.py, see https://github.com/madgraph5/madgraph4gpu/issues/878)
+# Precedence: USEGTEST=0 > pre-existing GTEST_ROOT > LOCALGTEST > in-repo common test dir > no tests
+###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation
+###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation
+TESTDIRCOMMON = ../../../../../test
+TESTDIRLOCAL = ../../test
+ifeq ($(USEGTEST),0)
+  TESTDIR=
+  GTEST_ROOT=
+else ifneq ($(wildcard $(GTEST_ROOT)),)
+  TESTDIR=
+else ifneq ($(LOCALGTEST),)
+  TESTDIR=$(TESTDIRLOCAL)
+  GTEST_ROOT=$(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
+else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
+  TESTDIR=$(TESTDIRCOMMON)
+  GTEST_ROOT= $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
+else
+  TESTDIR=
+endif
+# Derive gtest include/lib locations from GTEST_ROOT (empty GTEST_ROOT disables the tests)
+ifneq ($(GTEST_ROOT),)
+  GTESTLIBDIR = $(GTEST_ROOT)/lib64/
+  GTESTLIBS = $(GTESTLIBDIR)/libgtest.a
+  GTESTINC = -I$(GTEST_ROOT)/include
+else
+  GTESTLIBDIR =
+  GTESTLIBS =
+  GTESTINC =
+endif
+###$(info GTEST_ROOT = $(GTEST_ROOT))
+###$(info LOCALGTEST = $(LOCALGTEST))
+###$(info TESTDIR = $(TESTDIR))
+
+#-------------------------------------------------------------------------------
+
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP
+
+# PowerPC-specific CXX compiler flags (being reviewed)
+ifeq ($(UNAME_P),ppc64le)
+  CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for cppnone and cppsse4
+  # Throughput references without the extra flags below: cppnone=1.41-1.42E6, cppsse4=2.15-2.19E6
+  ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change
+  ###CXXFLAGS+= -fpeel-loops # no change
+  ###CXXFLAGS+= -funroll-loops # gains ~1%% for cppnone, loses ~1%% for cppsse4
+  ###CXXFLAGS+= -ftree-vectorize # no change
+  ###CXXFLAGS+= -flto # would increase to cppnone=4.08-4.12E6, cppsse4=4.99-5.03E6!
+else
+  ###CXXFLAGS+= -flto # also on Intel this would increase throughputs by a factor 2 to 4...
+  ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
+endif
+
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
+ifeq ($(UNAME_P),ppc64le)
+  GPUFLAGS+= $(XCOMPILERFLAG) -mno-float128
+endif
+
+#-------------------------------------------------------------------------------
+
+#=== Configure defaults for OMPFLAGS
+
+# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758)
+ifeq ($(USEOPENMP),1)
+  ###$(info USEOPENMP==1: will build with OpenMP if possible)
+  ifneq ($(findstring hipcc,$(GPUCC)),)
+    override OMPFLAGS = # disable OpenMP MT when using hipcc #802
+  else ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
+    override OMPFLAGS = -fopenmp
+    ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
+  else ifneq ($(shell $(CXX) --version | egrep '^clang version 16'),)
+    ###override OMPFLAGS = # disable OpenMP on clang16 #904
+    $(error OpenMP is not supported by cudacpp on clang16 - issue #904)
+  else ifneq ($(shell $(CXX) --version | egrep '^clang version 17'),)
+    ###override OMPFLAGS = # disable OpenMP on clang17 #904
+    $(error OpenMP is not supported by cudacpp on clang17 - issue #904)
+  else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
+    override OMPFLAGS = -fopenmp
+    ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
+    ###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler)
+  else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler)
+    override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578)
+    ###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? (AV Oct 2023: this still fails in the CI)
+  else
+    override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms
+    ###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575)
+  endif
+else
+  ###$(info USEOPENMP!=1: will build without OpenMP)
+  override OMPFLAGS =
+endif
+
+#-------------------------------------------------------------------------------
+
+#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND
+
+# If the legacy RNDGEN exists, this take precedence over any HASCURAND choice (but a warning is printed out)
+###$(info RNDGEN=$(RNDGEN))
+ifneq ($(RNDGEN),)
+  $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!)
+  ifeq ($(RNDGEN),hasCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifeq ($(RNDGEN),hasNoCurand)
+    override HASCURAND = $(RNDGEN)
+  # NOTE(review): this 'else ifneq' is always true when reached (the hasNoCurand case
+  # was already caught above), so it is equivalent to a plain 'else' — confirm and simplify
+  else ifneq ($(RNDGEN),hasNoCurand)
+    $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+  endif
+endif
+
+# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND
+# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
+ifeq ($(HASCURAND),)
+  ifeq ($(GPUCC),) # CPU-only build
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
+      # By default, assume that curand is installed if a CUDA installation exists
+      override HASCURAND = hasCurand
+    endif
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
+    override HASCURAND = hasCurand
+  else # non-Nvidia GPU build
+    override HASCURAND = hasNoCurand
+  endif
+endif
+
+# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND
+# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...)
+ifeq ($(HASHIPRAND),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASHIPRAND = hasNoHiprand
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override HASHIPRAND = hasHiprand
+  else # non-AMD GPU build
+    override HASHIPRAND = hasNoHiprand
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
+#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
+
+# Set the build flags appropriate to OMPFLAGS
+$(info OMPFLAGS=$(OMPFLAGS))
+CXXFLAGS += $(OMPFLAGS)
+
+# Set the build flags appropriate to each BACKEND choice (example: "make BACKEND=cppnone")
+# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
+# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
+# Dispatch order: PowerPC, then ARM, then nvc++ on x86, then the default x86 toolchains
+ifeq ($(UNAME_P),ppc64le)
+  ifeq ($(BACKEND),cppsse4)
+    override AVXFLAGS = -D__SSE4_2__ # Power9 VSX with 128 width (VSR registers)
+  else ifeq ($(BACKEND),cppavx2)
+    $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment)
+  else ifeq ($(BACKEND),cpp512y)
+    $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment)
+  else ifeq ($(BACKEND),cpp512z)
+    $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on PowerPC for the moment)
+  endif
+else ifeq ($(UNAME_P),arm)
+  ifeq ($(BACKEND),cppsse4)
+    override AVXFLAGS = -D__SSE4_2__ # ARM NEON with 128 width (Q/quadword registers)
+  else ifeq ($(BACKEND),cppavx2)
+    $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment)
+  else ifeq ($(BACKEND),cpp512y)
+    $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment)
+  else ifeq ($(BACKEND),cpp512z)
+    $(error Invalid SIMD BACKEND='$(BACKEND)': only 'cppnone' and 'cppsse4' are supported on ARM for the moment)
+  endif
+else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
+  ifeq ($(BACKEND),cppnone)
+    override AVXFLAGS = -mno-sse3 # no SIMD
+  else ifeq ($(BACKEND),cppsse4)
+    override AVXFLAGS = -mno-avx # SSE4.2 with 128 width (xmm registers)
+  else ifeq ($(BACKEND),cppavx2)
+    override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang]
+  else ifeq ($(BACKEND),cpp512y)
+    override AVXFLAGS = -march=skylake -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
+  else ifeq ($(BACKEND),cpp512z)
+    override AVXFLAGS = -march=skylake -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  endif
+else
+  ifeq ($(BACKEND),cppnone)
+    override AVXFLAGS = -march=x86-64 # no SIMD (see #588)
+  else ifeq ($(BACKEND),cppsse4)
+    override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers)
+  else ifeq ($(BACKEND),cppavx2)
+    override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang]
+  else ifeq ($(BACKEND),cpp512y)
+    override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
+  else ifeq ($(BACKEND),cpp512z)
+    override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  endif
+endif
+# For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
+ifeq ($(GPUCC),)
+  CXXFLAGS+= $(AVXFLAGS)
+endif
+
+# Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
+# ('d'=double everywhere, 'f'=float everywhere, 'm'=mixed i.e. double main precision with float secondary precision)
+$(info FPTYPE='$(FPTYPE)')
+ifeq ($(FPTYPE),d)
+  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+else ifeq ($(FPTYPE),f)
+  CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+else ifeq ($(FPTYPE),m)
+  CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+  GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+else
+  $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
+endif
+
+# Set the build flags appropriate to each HELINL choice (example: "make HELINL=1")
+# (only '0' and '1' are valid; '1' inlines the helicity amplitude functions)
+$(info HELINL='$(HELINL)')
+ifeq ($(HELINL),1)
+  CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
+  GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
+else ifneq ($(HELINL),0)
+  $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
+endif
+
+# Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1")
+# (only '0' and '1' are valid; '1' hardcodes the physics parameters at compile time)
+$(info HRDCOD='$(HRDCOD)')
+ifeq ($(HRDCOD),1)
+  CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
+  GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
+else ifneq ($(HRDCOD),0)
+  $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
+endif
+
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND
+
+$(info HASCURAND=$(HASCURAND))
+$(info HASHIPRAND=$(HASHIPRAND))
+override RNDCXXFLAGS=
+override RNDLIBFLAGS=
+
+# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand")
+ifeq ($(HASCURAND),hasNoCurand)
+  override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND
+else ifeq ($(HASCURAND),hasCurand)
+  override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
+else
+  $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported)
+endif
+
+# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand")
+ifeq ($(HASHIPRAND),hasNoHiprand)
+  override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND
+else ifeq ($(HASHIPRAND),hasHiprand)
+  override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand
+else
+  # (NB: a plain 'else' is enough here, consistently with the HASCURAND check above:
+  # the 'hasHiprand' case was already caught by the previous branch)
+  $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported)
+endif
+
+#$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
+#$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
+
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
+#-------------------------------------------------------------------------------
+
+#=== Configure Position-Independent Code
+# (-fPIC is required because the objects are linked into shared libraries below)
+CXXFLAGS += -fPIC
+GPUFLAGS += $(XCOMPILERFLAG) -fPIC
+
+#-------------------------------------------------------------------------------
+
+#=== Configure channelid debugging
+ifneq ($(MG5AMC_CHANNELID_DEBUG),)
+  CXXFLAGS += -DMGONGPU_CHANNELID_DEBUG
+  GPUFLAGS += -DMGONGPU_CHANNELID_DEBUG
+endif
+
+#-------------------------------------------------------------------------------
+
+#=== Configure build directories and build lockfiles ===
+
+# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed)
+# (Rationale: avoid mixing of builds with different random number generators)
+override TAG = $(patsubst cpp%%,%%,$(BACKEND))_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND)
+
+# Export TAG (so that there is no need to check/define it again in cudacpp_src.mk)
+export TAG
+
+# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1
+override BUILDDIR = $(CUDACPP_BUILDDIR)
+ifeq ($(USEBUILDDIR),1)
+  override LIBDIR = ../../lib/$(BUILDDIR)
+  override LIBDIRRPATH = '$$ORIGIN/../$(LIBDIR)'
+  $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR == 1))
+else
+  override LIBDIR = ../../lib
+  override LIBDIRRPATH = '$$ORIGIN/$(LIBDIR)'
+  $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR != 1))
+endif
+###override INCDIR = ../../include
+###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG))
+
+# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH
+# Use relative paths with respect to the executables or shared libraries ($ORIGIN on Linux)
+# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary
+ifeq ($(UNAME_S),Darwin)
+  override CXXLIBFLAGSRPATH =
+  override GPULIBFLAGSRPATH =
+  override CXXLIBFLAGSRPATH2 =
+  override GPULIBFLAGSRPATH2 =
+else
+  # RPATH to gpu/cpp libs when linking executables
+  override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+  override GPULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
+  # RPATH to common lib when linking gpu/cpp libs
+  override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+  override GPULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
+endif
+
+# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac)
+override RUNTIME =
+
+#===============================================================================
+#=== Makefile TARGETS and build rules below
+#===============================================================================
+
+
+ifeq ($(GPUCC),)
+  cxx_checkmain=$(BUILDDIR)/check_cpp.exe
+  cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe
+  cxx_testmain=$(BUILDDIR)/runTest_cpp.exe
+else
+  gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe
+  gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe
+  gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe
+endif
+
+# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal)
+.DEFAULT_GOAL := all.$(TAG)
+
+# First target (default goal)
+ifeq ($(GPUCC),)
+all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain))
+else
+all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain))
+endif
+
+# Target (and build options): debug
+MAKEDEBUG=
+debug: OPTFLAGS = -g -O0
+debug: CUDA_OPTFLAGS = -G
+debug: MAKEDEBUG := debug
+debug: all.$(TAG)
+
+# Target (and build options): address sanitizer #207
+###CXXLIBFLAGSASAN =
+###GPULIBFLAGSASAN =
+###asan: OPTFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer
+###asan: CUDA_OPTFLAGS = -G $(XCOMPILERFLAG) -fsanitize=address $(XCOMPILERFLAG) -fno-omit-frame-pointer
+###asan: CXXLIBFLAGSASAN = -fsanitize=address
+###asan: GPULIBFLAGSASAN = -Xlinker -fsanitize=address -Xlinker -shared
+###asan: MAKEDEBUG := debug
+###asan: all.$(TAG)
+
+# Target: tag-specific build lockfiles
+override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi`
+$(BUILDDIR)/.build.$(TAG):
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	@if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi
+	@touch $(BUILDDIR)/.build.$(TAG)
+
+# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516)
+# Added edgecase for HIP compilation
+ifeq ($(shell $(CXX) --version | grep ^nvc++),)
+$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
+$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math
+$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math
+endif
+
+# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679)
+# (NB: the GPU object is compiled with GPUFLAGS, not CXXFLAGS - see the generic
+# $(GPUCC) build rule below - so the flags must be added to GPUFLAGS for that target,
+# consistently with the RNDCXXFLAGS lines in the next section)
+$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC)
+$(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(USE_NVTX) $(CUDA_INC)
+
+# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o
+$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS)
+ifeq ($(HASCURAND),hasCurand) # curand headers, #679
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC)
+endif
+ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers
+$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC)
+endif
+
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
+ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
+endif
+endif
+
+# Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516)
+# This patch does remove the warning, but I prefer to keep it disabled for the moment...
+###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),)
+###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option
+###ifneq ($(GPUCC),)
+###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option
+###endif
+###endif
+
+#### Apply special build flags only to CPPProcess.o (-flto)
+###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto
+
+#### Apply special build flags only to CPPProcess.o (AVXFLAGS)
+###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS)
+
+# Generic target and build rules: objects from C++ compilation
+# (NB do not include CUDA_INC here! add it only for NVTX or curand #679)
+$(BUILDDIR)/%%_cpp.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@
+
+# Generic target and build rules: objects from CUDA or HIP compilation
+# (NB: GPU objects are built from the same .cc sources via '-x $(GPULANGUAGE)')
+ifneq ($(GPUCC),)
+$(BUILDDIR)/%%_$(GPUSUFFIX).o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
+endif
+
+#-------------------------------------------------------------------------------
+
+# Target (and build rules): common (src) library
+commonlib : $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so
+
+$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../../src/*.h ../../src/*.cc $(BUILDDIR)/.build.$(TAG)
+	$(MAKE) -C ../../src $(MAKEDEBUG) -f $(CUDACPP_SRC_MAKEFILE)
+
+#-------------------------------------------------------------------------------
+
+processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
+###$(info processid_short=$(processid_short))
+
+MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
+cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
+
+ifneq ($(GPUCC),)
+MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
+gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
+endif
+
+# Target (and build rules): C++ and CUDA/HIP shared libraries
+$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o
+$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o
+$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib)
+	$(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+
+ifneq ($(GPUCC),)
+$(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
+$(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
+$(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
+# Bypass std::filesystem completely to ease portability on LUMI #803
+#ifneq ($(findstring hipcc,$(GPUCC)),)
+#	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
+#else
+#	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+#endif
+endif
+
+#-------------------------------------------------------------------------------
+
+# Target (and build rules): Fortran include files
+###$(INCDIR)/%%.inc : ../%%.inc
+###	@if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi
+###	\cp $< $@
+
+#-------------------------------------------------------------------------------
+
+# Target (and build rules): C++ and CUDA/HIP standalone executables
+###$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSASAN)
+$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o
+	$(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS)
+
+ifneq ($(GPUCC),)
+###$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSASAN)
+ifneq ($(shell $(CXX) --version | grep ^Intel),)
+$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy')
+$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9')
+else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
+$(gpu_checkmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
+endif
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS)
+$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o
+	$(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS)
+endif
+
+#-------------------------------------------------------------------------------
+
+# Generic target and build rules: objects from Fortran compilation
+# (NB In this makefile, this only applies to fcheck_sa_fortran.o)
+# (NB -fPIC was added to fix clang16 build #904, but this seems better for other cases too and is consistent to c++ and cuda builds)
+$(BUILDDIR)/%%_fortran.o : %%.f *.inc
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(FC) -I. -fPIC -c $< -o $@
+
+# Generic target and build rules: objects from Fortran compilation
+###$(BUILDDIR)/%%_fortran.o : %%.f *.inc
+###	@if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi
+###	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+###	$(FC) -I. -I$(INCDIR) -c $< -o $@
+
+# Target (and build rules): Fortran standalone executables
+###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc
+
+###$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSASAN)
+ifeq ($(UNAME_S),Darwin)
+$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375
+endif
+$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe)
+ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
+	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++
+else
+	$(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe)
+endif
+
+ifneq ($(GPUCC),)
+###$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSASAN)
+ifneq ($(shell $(CXX) --version | grep ^Intel),)
+$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy')
+$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9')
+endif
+ifeq ($(UNAME_S),Darwin)
+$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375
+endif
+$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS)
+$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe)
+ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
+	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64
+else
+	$(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe)
+endif
+endif
+
+#-------------------------------------------------------------------------------
+
+# Target (and build rules): test objects and test executable
+ifeq ($(GPUCC),)
+$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS)
+$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC)
+$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt
+$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o
+$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions
+else
+$(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS)
+$(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC)
+$(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt
+$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o
+$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions
+endif
+
+ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838)
+ifeq ($(GPUCC),)
+$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS)
+$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC)
+$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o
+$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests
+else
+$(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS)
+$(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC)
+$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o
+$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests
+endif
+endif
+
+ifeq ($(GPUCC),)
+$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS)
+$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC)
+$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o
+$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o
+else
+$(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS)
+$(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC)
+ifneq ($(shell $(CXX) --version | grep ^Intel),)
+$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy')
+$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9')
+else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
+$(gpu_testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
+endif
+$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o
+$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o
+endif
+
+ifeq ($(GPUCC),)
+$(cxx_testmain): $(GTESTLIBS)
+$(cxx_testmain): INCFLAGS += $(GTESTINC)
+$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc
+else
+$(gpu_testmain): $(GTESTLIBS)
+$(gpu_testmain): INCFLAGS += $(GTESTINC)
+$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc
+endif
+
+ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds)
+ifneq ($(OMPFLAGS),)
+ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
+$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648)
+else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
+$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604
+###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604)
+else
+$(cxx_testmain): LIBFLAGS += -lgomp
+endif
+endif
+endif
+
+# Test quadmath in testmisc.cc tests for constexpr_math #627
+###ifeq ($(GPUCC),)
+###$(cxx_testmain): LIBFLAGS += -lquadmath
+###else
+###$(gpu_testmain): LIBFLAGS += -lquadmath
+###endif
+
+# Bypass std::filesystem completely to ease portability on LUMI #803
+###ifneq ($(findstring hipcc,$(GPUCC)),)
+###$(gpu_testmain): LIBFLAGS += -lstdc++fs
+###endif
+
+ifeq ($(GPUCC),) # link only runTest_cpp.o
+###$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSASAN)
+$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
+	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
+else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o)
+###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN)
+$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS)
+$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS)
+ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
+	$(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64
+else
+	$(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+endif
+endif
+
+# Use target gtestlibs to build only googletest
+ifneq ($(GTESTLIBS),)
+gtestlibs: $(GTESTLIBS)
+endif
+
+# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
+$(GTESTLIBS):
+ifneq ($(shell which flock 2>/dev/null),)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR)
+else
+	if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi
+endif
+
+#-------------------------------------------------------------------------------
+
+# Target: build all targets in all BACKEND modes (each BACKEND mode in a separate build directory)
+# Split the bldall target into separate targets to allow parallel 'make -j bldall' builds
+# (Obsolete hack, no longer needed as there is no INCDIR: add a fbridge.inc dependency to bldall, to ensure it is only copied once for all BACKEND modes)
+bldcuda:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cuda -f $(CUDACPP_MAKEFILE)
+
+bldhip:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=hip -f $(CUDACPP_MAKEFILE)
+
+bldnone:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppnone -f $(CUDACPP_MAKEFILE)
+
+bldsse4:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 -f $(CUDACPP_MAKEFILE)
+
+bldavx2:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 -f $(CUDACPP_MAKEFILE)
+
+bld512y:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512y -f $(CUDACPP_MAKEFILE)
+
+bld512z:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
+
+# (only the 128-bit SIMD backends are valid on PowerPC and ARM - see the AVXFLAGS dispatch above)
+ifeq ($(UNAME_P),ppc64le)
+###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
+bldavxs: bldnone bldsse4
+else ifeq ($(UNAME_P),arm)
+###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
+bldavxs: bldnone bldsse4
+else
+###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavx2 bld512y bld512z
+bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z
+endif
+
+# Add the GPU backends to bldall only when the corresponding toolkit is installed
+ifneq ($(HIP_HOME),)
+ifneq ($(CUDA_HOME),)
+bldall: bldhip bldcuda bldavxs
+else
+bldall: bldhip bldavxs
+endif
+else
+ifneq ($(CUDA_HOME),)
+bldall: bldcuda bldavxs
+else
+bldall: bldavxs
+endif
+endif
+
+#-------------------------------------------------------------------------------
+
+# Target: clean the builds
+.PHONY: clean
+
+clean:
+ifeq ($(USEBUILDDIR),1)
+	rm -rf $(BUILDDIR)
+else
+	rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o
$(BUILDDIR)/*.exe + rm -f $(LIBDIR)/lib*.so +endif + $(MAKE) -C ../../src clean -f $(CUDACPP_SRC_MAKEFILE) +### rm -rf $(INCDIR) + +cleanall: + @echo + $(MAKE) USEBUILDDIR=0 clean -f $(CUDACPP_MAKEFILE) + @echo + $(MAKE) USEBUILDDIR=0 -C ../../src cleanall -f $(CUDACPP_SRC_MAKEFILE) + rm -rf build.* + +# Target: clean the builds as well as the gtest installation(s) +distclean: cleanall +ifneq ($(wildcard $(TESTDIRCOMMON)),) + $(MAKE) -C $(TESTDIRCOMMON) clean +endif + $(MAKE) -C $(TESTDIRLOCAL) clean + +#------------------------------------------------------------------------------- + +# Target: show system and compiler information +info: + @echo "" + @uname -spn # e.g. Linux nodename.cern.ch x86_64 +ifeq ($(UNAME_S),Darwin) + @sysctl -a | grep -i brand + @sysctl -a | grep machdep.cpu | grep features || true + @sysctl -a | grep hw.physicalcpu: + @sysctl -a | grep hw.logicalcpu: +else + @cat /proc/cpuinfo | grep "model name" | sort -u + @cat /proc/cpuinfo | grep "flags" | sort -u + @cat /proc/cpuinfo | grep "cpu cores" | sort -u + @cat /proc/cpuinfo | grep "physical id" | sort -u +endif + @echo "" +ifneq ($(shell which nvidia-smi 2>/dev/null),) + nvidia-smi -L + @echo "" +endif + @echo USECCACHE=$(USECCACHE) +ifeq ($(USECCACHE),1) + ccache --version | head -1 +endif + @echo "" + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version +endif + @echo "" + @echo CXX=$(CXX) +ifneq ($(shell $(CXX) --version | grep ^clang),) + @echo $(CXX) -v + @$(CXX) -v |& egrep -v '(Found|multilib)' + @readelf -p .comment `$(CXX) -print-libgcc-file-name` |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print "GCC toolchain:",$$5}' +else + $(CXX) --version +endif + @echo "" + @echo FC=$(FC) + $(FC) --version + +#------------------------------------------------------------------------------- + +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] 
+# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck + +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) +runTest: all.$(TAG) +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif + +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) +runCheck: all.$(TAG) +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif + +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) +runFcheck: all.$(TAG) +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif + +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) +cmpFcheck: all.$(TAG) + @echo +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif + +# Target: cuda-memcheck (run the CUDA standalone executable check_$(GPUSUFFIX).exe with a small number of events through cuda-memcheck) +cuda-memcheck: all.$(TAG) + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 + +#------------------------------------------------------------------------------- diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk new file mode 100644 index 0000000000..b57e56d182 --- /dev/null +++ 
b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk @@ -0,0 +1,97 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. Valassi (Mar 2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#------------------------------------------------------------------------------- + +#=== Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported +#=== Configure default values for these variables if no user-defined choices exist + +# Set the default BACKEND (CUDA, HIP or C++/SIMD) choice +ifeq ($(BACKEND),) + override BACKEND = cppauto +endif + +# Set the default FPTYPE (floating point type) choice +# NB: this only affects manual 'make' builds (madevent 'launch' builds are controlled by floating_type in run_card.dat) +ifeq ($(FPTYPE),) + # OLD DEFAULT UP TO v1.00.00 INCLUDED (inconsistent with default floating_type='m' in run_card.dat) + ###override FPTYPE = d + # NEW DEFAULT (#995) AS OF v1.00.01 (now consistent with default floating_type='m' in run_card.dat) + override FPTYPE = m +endif + +# Set the default HELINL (inline helicities?) choice +ifeq ($(HELINL),) + override HELINL = 0 +endif + +# Set the default HRDCOD (hardcode cIPD physics parameters?) 
choice +ifeq ($(HRDCOD),) + override HRDCOD = 0 +endif + +# Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported +# (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) + $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) +endif + +override SUPPORTED_FPTYPES = d f m +ifneq ($(words $(filter $(FPTYPE), $(SUPPORTED_FPTYPES))),1) + $(error Invalid fptype FPTYPE='$(FPTYPE)': supported fptypes are $(foreach fptype,$(SUPPORTED_FPTYPES),'$(fptype)')) +endif + +override SUPPORTED_HELINLS = 0 1 +ifneq ($(words $(filter $(HELINL), $(SUPPORTED_HELINLS))),1) + $(error Invalid helinl HELINL='$(HELINL)': supported helinls are $(foreach helinl,$(SUPPORTED_HELINLS),'$(helinl)')) +endif + +override SUPPORTED_HRDCODS = 0 1 +ifneq ($(words $(filter $(HRDCOD), $(SUPPORTED_HRDCODS))),1) + $(error Invalid hrdcod HRDCOD='$(HRDCOD)': supported hrdcods are $(foreach hrdcod,$(SUPPORTED_HRDCODS),'$(hrdcod)')) +endif + +# Print out BACKEND, FPTYPE, HELINL, HRDCOD +###$(info BACKEND='$(BACKEND)') +###$(info FPTYPE='$(FPTYPE)') +###$(info HELINL='$(HELINL)') +###$(info HRDCOD='$(HRDCOD)') + +#------------------------------------------------------------------------------- + +# Stop immediately if BACKEND=cuda but nvcc is missing +ifeq ($(BACKEND),cuda) + ifeq ($(shell which nvcc 2>/dev/null),) + $(error BACKEND=$(BACKEND) but nvcc was not found) + endif +endif + +# Stop immediately if BACKEND=hip but hipcc is missing +ifeq ($(BACKEND),hip) + ifeq ($(shell which hipcc 2>/dev/null),) + $(error BACKEND=$(BACKEND) but hipcc was not found) + endif +endif + +#------------------------------------------------------------------------------- + +#=== Configure CUDACPP_BUILDDIR + +# Build directory 
"short" tag (defines target and path to the optional build directory) +# (Rationale: keep directory names shorter, e.g. do not include random number generator choice) +# ** NB: using ':=' here ensures that 'cppauto' is used as such before being changed later on! +override DIRTAG := $(patsubst cpp%,%,$(BACKEND))_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) + +# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 +ifeq ($(USEBUILDDIR),1) + override CUDACPP_BUILDDIR = build.$(DIRTAG) +else + override CUDACPP_BUILDDIR = . +endif +###$(info USEBUILDDIR='$(USEBUILDDIR)') +###$(info CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)') + +#------------------------------------------------------------------------------- diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_overlay.mk b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_overlay.mk new file mode 100644 index 0000000000..adbfcad2bf --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_overlay.mk @@ -0,0 +1,295 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: D. Massaro (Sep 2025) for the MG5aMC CUDACPP plugin. +# Based on code originally written by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) + +# To be used after the project makefile +SHELL := /bin/bash + +# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829) +# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing +include ../../src/cudacpp_config.mk +ifeq ($(CUDACPP_BUILDDIR),) + $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!) 
+endif + +# Basic uname helpers (if not already set) +UNAME_S ?= $(shell uname -s) +UNAME_P ?= $(shell uname -p) + +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html +FFLAGS+= -cpp + +# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) +CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Add -std=c++17 explicitly to avoid build errors on macOS +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3 +endif + +# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran) +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX := ccache $(CXX) +endif + +# ---------------------------------------------------------------------- +# Backend library names and process id +# ---------------------------------------------------------------------- +CUDACPP_MAKEFILE := cudacpp.mk +processid_short := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') + +ifeq ($(BACKEND),cuda) + CUDACPP_COMMONLIB := mg5amc_common_cuda + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) + CUDACPP_COMMONLIB := mg5amc_common_hip + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip +else + CUDACPP_COMMONLIB := mg5amc_common_cpp + CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp +endif + +# ---------------------------------------------------------------------- +# Libraries and link line adjustments +# ---------------------------------------------------------------------- +# Prefer LIBDIR everywhere; base makefile already defines LIBDIR. 
+LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \ + -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +# OpenMP: enable only if requested, USEOPENMP=1 (#758) +ifeq ($(USEOPENMP),1) + ifneq ($(shell $(CXX) --version | egrep '^Intel'),) + override OMPFLAGS = -fopenmp + LINKLIBS += -liomp5 # see #578 + LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy' + else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override OMPFLAGS = -fopenmp + # For the *cpp* binary with clang, ensure libomp is found + $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 + else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang + else + override OMPFLAGS = -fopenmp + endif +endif + +# ---------------------------------------------------------------------- +# Objects & targets +# ---------------------------------------------------------------------- +# Keep driver* separate from PROCESS; we form DSIG groups below. 
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \ + cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \ + idenparts.o dummy_fct.o + +DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) +DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f))) + +SYMMETRY := symmetry.o idenparts.o + +# Binaries + +ifeq ($(UNAME_S),Darwin) + LDFLAGS += -lc++ -mmacosx-version-min=11.3 +else + LDFLAGS += -Wl,--no-relax +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all +ifeq ($(BACKEND),cuda) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda +else ifeq ($(BACKEND),hip) + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip +else + all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp +endif + +# Library build stamps +$(LIBS): .libs + +.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat + $(MAKE) -C ../../Source + touch $@ + +$(CUDACPP_BUILDDIR)/.cudacpplibs: + $(MAKE) -f $(CUDACPP_MAKEFILE) + touch $@ + +# Remove per-library recipes from makefile to avoid duplicate sub-makes +# under ../../Source running in parallel otherwise we can have race condition +# Build the libs only via the single .libs stamp. + +# Ensure these targets are satisfied by building Source once +$(LIBDIR)libmodel.$(libext) : | .libs +$(LIBDIR)libgeneric.$(libext) : | .libs +$(LIBDIR)libpdf.$(libext) : | .libs +$(LIBDIR)libgammaUPC.$(libext) : | .libs + +# Override the recipes from makefile_orig with empty recipes +# (GNU Make will use the last recipe it reads.) 
+$(LIBDIR)libmodel.$(libext) : ; @: +$(LIBDIR)libgeneric.$(libext) : ; @: +$(LIBDIR)libpdf.$(libext) : ; @: +$(LIBDIR)libgammaUPC.$(libext) : ; @: + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override LIBFLAGSRPATH := +else ifeq ($(USEBUILDDIR),1) + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)' +else + override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)' +endif + +# Final link steps +$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o + $(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_cuda now uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Building $(PROG)_hip also uses its own rule +$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) 
$(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + +# Helpers compiled with C++ +counters.o: counters.cc timer.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h + $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +# Alternate binaries (kept for parity) +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) + +gensym: $(SYMMETRY) configs.inc $(LIBS) + $(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) + +# Compile rules (override base ones) +$(MATRIX): %.o: %.f + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%.o: %.f + $(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC + +%_cudacpp.o: %.f + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ + +# Extra dependencies on discretesampler.mod +auto_dsig.o: .libs +driver.o: .libs +driver_cudacpp.o: .libs +$(MATRIX): .libs +genps.o: .libs + +# Convenience link targets to switch $(PROG) symlink +.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link +madevent_fortran_link: $(PROG)_fortran + rm -f $(PROG) + ln -s $(PROG)_fortran $(PROG) + +madevent_cuda_link: + $(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) + +madevent_hip_link: + $(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG) + +madevent_cpp_link: + $(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Variant AVX builds 
for cpp backend +override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +madevent_%_link: + @if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \ + echo "ERROR! Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi + $(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG) + +# Cudacpp bldall targets +ifeq ($(UNAME_P),ppc64le) + bldavxs: bldnone bldsse4 +else ifeq ($(UNAME_P),arm) + bldavxs: bldnone bldsse4 +else + bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z +endif + +ifneq ($(shell which hipcc 2>/dev/null),) + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldhip bldcuda bldavxs + else + bldall: bldhip bldavxs + endif +else + ifneq ($(shell which nvcc 2>/dev/null),) + bldall: bldcuda bldavxs + else + bldall: bldavxs + endif +endif + +bldcuda: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cuda + +bldhip: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=hip + +bldnone: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppnone + +bldsse4: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppsse4 + +bldavx2: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppavx2 + +bld512y: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512y + +bld512z: $(PROG)_fortran $(DSIG_cudacpp) + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z + +# Clean (NB: 'make clean' in Source calls 'make clean' in all P*) +clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn + $(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \ + $(CUDACPP_BUILDDIR)/$(PROG)_cpp \ + $(CUDACPP_BUILDDIR)/$(PROG)_cuda \ + $(CUDACPP_BUILDDIR)/$(PROG)_hip + +cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src + $(MAKE) -f $(CUDACPP_MAKEFILE) cleanall + 
rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs + rm -f .libs + +cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src + $(MAKE) -C ../../Source cleanall + rm -rf $(LIBDIR)libbias.$(libext) + rm -f ../../Source/*.mod ../../Source/*/*.mod + +distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation + $(MAKE) -f $(CUDACPP_MAKEFILE) distclean + diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk new file mode 100644 index 0000000000..879b3a15d1 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -0,0 +1,185 @@ +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. + +#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) +#=== NB: assume that the same name (e.g. cudacpp.mk, Makefile...) is used in the Subprocess and src directories + +THISMK = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) + +#------------------------------------------------------------------------------- + +#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) + +SHELL := /bin/bash + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for CUDA and C++ + +INCFLAGS = -I. 
+ +#------------------------------------------------------------------------------- + +#=== Configure the C++ compiler (note: CXXFLAGS has been exported from cudacpp.mk) + +###$(info CXXFLAGS=$(CXXFLAGS)) + +# Note: AR, CXX and FC are implicitly defined if not set externally +# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html +###RANLIB = ranlib + +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +LDFLAGS = +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) + LDFLAGS += -mmacosx-version-min=11.3 +endif + +#------------------------------------------------------------------------------- + +#=== Configure the GPU (CUDA or HIP) compiler (note: GPUCC including ccache, GPUFLAGS, GPULANGUAGE, GPUSUFFIX have been exported from cudacpp.mk) + +###$(info GPUCC=$(GPUCC)) +###$(info GPUFLAGS=$(GPUFLAGS)) +###$(info GPULANGUAGE=$(GPULANGUAGE)) +###$(info GPUSUFFIX=$(GPUSUFFIX)) + +#------------------------------------------------------------------------------- + +#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache) + +# Enable ccache if USECCACHE=1 +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX:=ccache $(CXX) +endif +#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) +# override AR:=ccache $(AR) +#endif + +#------------------------------------------------------------------------------- + +#=== Configure build directories and build lockfiles === + +# Use the build directory exported from cudacpp.mk +###$(info CUDACPP_BUILDDIR=$(CUDACPP_BUILDDIR)) + +# Use the build lockfile "full" tag exported from cudacpp.mk +###$(info TAG=$(TAG)) + +# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 +###$(info Current directory is $(shell pwd)) +override BUILDDIR = $(CUDACPP_BUILDDIR) +ifeq ($(USEBUILDDIR),1) + override LIBDIRREL = ../lib/$(BUILDDIR) + ###$(info Building in 
BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR=1 is set)) +else + override LIBDIRREL = ../lib + ###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is not set)) +endif +######$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) + +# Workaround for Mac #375 (I did not manage to fix rpath with @executable_path): use absolute paths for LIBDIR +# (NB: this is quite ugly because it creates the directory if it does not exist - to avoid removing src by mistake) +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + override LIBDIR = $(shell mkdir -p $(LIBDIRREL); cd $(LIBDIRREL); pwd) + ifeq ($(wildcard $(LIBDIR)),) + $(error Directory LIBDIR="$(LIBDIR)" should have been created by now) + endif +else + override LIBDIR = $(LIBDIRREL) +endif + +#=============================================================================== +#=== Makefile TARGETS and build rules below +#=============================================================================== + +# NB1: there are no CUDA targets in src as we avoid RDC! +# NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! + +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif + +# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) +.DEFAULT_GOAL := all.$(TAG) + +# First target (default goal) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + +# Target (and build options): debug +debug: all.$(TAG) + +# Target: tag-specific build lockfiles +override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` +override oldtagsl=`if [ -d $(LIBDIR) ]; then find $(LIBDIR) -maxdepth 1 -name '.build.*' ! 
-name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` + +$(BUILDDIR)/.build.$(TAG): $(LIBDIR)/.build.$(TAG) + +$(LIBDIR)/.build.$(TAG): + @if [ "$(oldtagsl)" != "" ]; then echo -e "Cannot build for tag=$(TAG) as old builds exist in $(LIBDIR) for other tags:\n$(oldtagsl)\nPlease run 'make clean' first\nIf 'make clean' is not enough: run 'make cleanall'"; exit 1; fi + @if [ "$(oldtagsb)" != "" ]; then echo -e "Cannot build for tag=$(TAG) as old builds exist in $(BUILDDIR) for other tags:\n$(oldtagsb)\nPlease run 'make clean' first\nIf 'make clean' is not enough: run 'make cleanall'"; exit 1; fi + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + @touch $(LIBDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + @touch $(BUILDDIR)/.build.$(TAG) + +#------------------------------------------------------------------------------- + +# Generic target and build rules: objects from C++ compilation +$(BUILDDIR)/%%_cpp.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ + +# Generic target and build rules: objects from CUDA compilation +ifneq ($(GPUCC),) +$(BUILDDIR)/%%_$(GPUSUFFIX).o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ +endif + +#------------------------------------------------------------------------------- + +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_cpp.o) +else + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_$(GPUSUFFIX).o) +endif + +# Target (and build rules): common (src) library +ifeq ($(GPUCC),) +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) +endif + +#------------------------------------------------------------------------------- + +# Target: clean the builds +.PHONY: clean + +clean: +ifeq ($(USEBUILDDIR),1) + rm -rf $(LIBDIR) + rm -rf $(BUILDDIR) +else + rm -f $(LIBDIR)/.build.* $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.exe +endif + +cleanall: + @echo + $(MAKE) clean -f $(THISMK) + @echo + rm -rf $(LIBDIR)/build.* + rm -rf build.* + +#------------------------------------------------------------------------------- diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk new file mode 100644 index 0000000000..48b2037dc2 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk @@ -0,0 +1,50 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
+ +THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + +# Host detection +UNAME_S := $(shell uname -s) + +# Only add AVX2/FMA on non-mac hosts +ifeq ($(UNAME_S),Darwin) + GTEST_CMAKE_FLAGS := +else + GTEST_CMAKE_FLAGS := -DCMAKE_CXX_FLAGS="-mavx2 -mfma" +endif + +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + +CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 + +all: googletest/$(INSTALLDIR)/lib64/libgtest.a + +googletest/CMakeLists.txt: + git clone https://github.com/google/googletest.git -b v1.17.0 googletest + +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install $(GTEST_CMAKE_FLAGS) -DBUILD_GMOCK=OFF ../ + +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) + +# NB 'make install' is no longer supported in googletest (issue 328) +# NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ + +clean: + rm -rf googletest diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/epoch_process_id.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/epoch_process_id.h new file mode 100644 index 0000000000..5214cdb32c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/epoch_process_id.h @@ -0,0 
+1,16 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Oct 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. + +#ifndef EPOCH_PROCESS_ID_H +#define EPOCH_PROCESS_ID_H 1 + +// No need to indicate EPOCHX_ any longer for auto-generated code +// However, keep the name of the file as it may be useful again for new manual developments +#define MG_EPOCH_PROCESS_ID %(processid_uppercase)s + +// For simplicity, define here the name of the process-dependent reference file for tests +#define MG_EPOCH_REFERENCE_FILE_NAME "../../test/ref/dump_CPUTest.%(processid)s.txt" + +#endif // EPOCH_PROCESS_ID_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc new file mode 100644 index 0000000000..a06c3a03aa --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -0,0 +1,160 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: D. Massaro, S. Roiser, J. Teig, A. Thete, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. + +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +extern "C" +{ + /** + * The namespace where the Bridge class is taken from. + * + * In the current implementation, two separate shared libraries are created for the GPU/CUDA and CPU/C++ implementations. + * Actually, two shared libraries for GPU and CPU are created for each of the five SIMD implementations on CPUs (none, sse4, avx2, 512y, 512z). + * A single fcreatebridge_ symbol is created in each library with the same name, connected to the appropriate Bridge on CPU or GPU. 
+ * The Fortran MadEvent code is always the same: the choice whether to use a CPU or GPU implementation is done by linking the appropriate library.
+ * As the names of the two CPU/GPU libraries are the same in the five SIMD implementations, the choice of SIMD is done by setting LD_LIBRARY_PATH.
+ *
+ * In a future implementation, a single heterogeneous shared library may be created, with the same interface.
+ * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
+ * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
+ */
+#ifdef MGONGPUCPP_GPUIMPL
+  using namespace mg5amcGpu;
+#else
+  using namespace mg5amcCpu;
+#endif
+
+  /**
+   * The floating point precision used in Fortran arrays.
+   * This is presently hardcoded to double precision (REAL*8).
+   */
+  using FORTRANFPTYPE = double; // for Fortran double precision (REAL*8) arrays
+  //using FORTRANFPTYPE = float; // for Fortran single precision (REAL*4) arrays
+
+  /**
+   * Create a Bridge and return its pointer.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param nevtF the pointer to the number of events in the Fortran arrays
+   * @param nparF the pointer to the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY)
+   * @param np4F the pointer to the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY)
+   */
+  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
+  {
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::setUp();
+#endif
+    // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
+    // FIXME: disable OMP in Bridge when called from Fortran
+    *ppbridge = new Bridge<FORTRANFPTYPE>( *pnevtF, *pnparF, *pnp4F );
+  }
+
+  /**
+   * Delete a Bridge.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable)
+   */
+  void fbridgedelete_( CppObjectInFortran** ppbridge )
+  {
+    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
+    delete pbridge;
+#ifdef MGONGPUCPP_GPUIMPL
+    GpuRuntime::tearDown();
+#endif
+  }
+
+  /**
+   * Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CPU/C++.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param momenta the pointer to the input 4-momenta
+   * @param gs the pointer to the input Gs (running QCD coupling constant alphas)
+   * @param iflavorVec the pointer to the input flavor indices for the flavor combinations
+   * @param rndhel the pointer to the input random numbers for helicity selection
+   * @param rndcol the pointer to the input random numbers for color selection
+   * @param channelIds the pointer to the input channels i.e. Feynman diagrams to enhance (1 to n: 0 is an invalid value!)
+   * @param mes the pointer to the output matrix elements
+   * @param selhel the pointer to the output selected helicities
+   * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
+   */
+  void fbridgesequence_( CppObjectInFortran** ppbridge,
+                         const FORTRANFPTYPE* momenta,
+                         const FORTRANFPTYPE* gs,
+                         const unsigned int* iflavorVec,
+                         const FORTRANFPTYPE* rndhel,
+                         const FORTRANFPTYPE* rndcol,
+                         const unsigned int* channelIds,
+                         FORTRANFPTYPE* mes,
+                         int* selhel,
+                         int* selcol,
+                         const bool* pgoodHelOnly )
+  {
+    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
+#ifdef MGONGPUCPP_GPUIMPL
+    // Use the device/GPU implementation in the CUDA library
+    // (there is also a host implementation in this library)
+    pbridge->gpu_sequence( momenta, gs, iflavorVec, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly );
+#else
+    // Use the host/CPU implementation in the C++ library
+    // (there is no device implementation in this library)
+    pbridge->cpu_sequence( momenta, gs, iflavorVec, rndhel, rndcol, channelIds, mes, selhel, selcol, *pgoodHelOnly );
+#endif
+  }
+
+  /**
+   * Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CPU/C++, without multi-channel mode.
+ * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). + * + * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable) + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant alphas) + * @param rndhel the pointer to the input random numbers for helicity selection + * @param rndcol the pointer to the input random numbers for color selection + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? + */ + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const unsigned int* iflavorVec, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ) + { + //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) ); + fbridgesequence_( ppbridge, momenta, gs, iflavorVec, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly ); + } + + /** + * Retrieve the number of good helicities for helicity filtering in the Bridge. + * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). 
+   *
+   * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param pngoodhel the pointer to the output number of good helicities
+   * @param pntothel the pointer to the output total number of helicities
+   */
+  void fbridgegetngoodhel_( CppObjectInFortran** ppbridge,
+                            unsigned int* pngoodhel,
+                            unsigned int* pntothel )
+  {
+    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    if( pbridge == 0 ) throw std::runtime_error( "fbridgegetngoodhel_: invalid Bridge address" );
+    *pngoodhel = pbridge->nGoodHel();
+    *pntothel = pbridge->nTotHel();
+  }
+}
diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h
new file mode 100644
index 0000000000..3f6c81e3d7
--- /dev/null
+++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.h
@@ -0,0 +1,52 @@
+// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: Z. Wettersten (Oct 2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: D. Massaro, A. Thete, A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+ +#include "Bridge.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" + +#ifndef _FBRIDGE_H_ +#define _FBRIDGE_H_ + +extern "C" +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + using FORTRANFPTYPE = double; + + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ); + + void fbridgedelete_( CppObjectInFortran** ppbridge ); + + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const unsigned int* iflavorVec, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* channelIds, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const unsigned int* iflavorVec, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool* pgoodHelOnly ); + + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, unsigned int* pngoodhel, unsigned int* pntothel ); +} +#endif // _FBRIDGE_H_ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc new file mode 100644 index 0000000000..06e4ccee50 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc @@ -0,0 +1,105 @@ +C Copyright (C) 2020-2024 CERN and UCLouvain. +C Licensed under the GNU Lesser General Public License (version 3 or later). +C Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. +C Further modified by: D. Massaro, A. Thete, A. Valassi (2022-2026) for the MG5aMC CUDACPP plugin. 
+ +C +C Create a Bridge and return its pointer +C - PBRIDGE: the memory address of the C++ Bridge +C - NEVT: the number of events in the Fortran arrays +C - NPAR: the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) +C - NP4: the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) +C + INTERFACE + SUBROUTINE FBRIDGECREATE(PBRIDGE, NEVT, NPAR, NP4) + INTEGER*8 PBRIDGE + INTEGER*4 NEVT + INTEGER*4 NPAR + INTEGER*4 NP4 + END SUBROUTINE FBRIDGECREATE + END INTERFACE + +C +C Delete a Bridge. +C - PBRIDGE: the memory address of the C++ Bridge +C + INTERFACE + SUBROUTINE FBRIDGEDELETE(PBRIDGE) + INTEGER*8 PBRIDGE + END SUBROUTINE FBRIDGEDELETE + END INTERFACE + +C +C Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++. +C - PBRIDGE: the memory address of the C++ Bridge +C - MOMENTA: the input 4-momenta Fortran array +C - GS: the input Gs (running QCD coupling constant alphas) Fortran array +C - IFLAV_VEC: the input array of flavor indices for the flavor combinations +C - RNDHEL: the input random number Fortran array for helicity selection +C - RNDCOL: the input random number Fortran array for color selection +C - CHANID: the input array of channels (Feynman diagrams) to enhance +C - MES: the output matrix element Fortran array +C - SELHEL: the output selected helicity Fortran array +C - SELCOL: the output selected color Fortran array +C - HELONLY: input flag, quit after computing good helicities? 
+C + INTERFACE + SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, + & IFLAV_VEC, RNDHEL, RNDCOL, CHANID, MES, SELHEL, + & SELCOL, HELONLY) + INTEGER*8 PBRIDGE + DOUBLE PRECISION MOMENTA(*) + DOUBLE PRECISION GS(*) + INTEGER*4 IFLAV_VEC(*) + DOUBLE PRECISION RNDHEL(*) + DOUBLE PRECISION RNDCOL(*) + INTEGER*4 CHANID(*) + DOUBLE PRECISION MES(*) + INTEGER*4 SELHEL(*) + INTEGER*4 SELCOL(*) + LOGICAL HELONLY + END SUBROUTINE FBRIDGESEQUENCE + END INTERFACE + +C +C Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++. +C - PBRIDGE: the memory address of the C++ Bridge +C - MOMENTA: the input 4-momenta Fortran array +C - GS: the input Gs (running QCD coupling constant alphas) Fortran array +C - IFLAV_VEC: the input array of flavor indices for the flavor combinations +C - RNDHEL: the input random number Fortran array for helicity selection +C - RNDCOL: the input random number Fortran array for color selection +C - MES: the output matrix element Fortran array +C - SELHEL: the output selected helicity Fortran array +C - SELCOL: the output selected color Fortran array +C - HELONLY: input flag, quit after computing good helicities? +C + INTERFACE + SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, + & IFLAV_VEC, RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY) + INTEGER*8 PBRIDGE + DOUBLE PRECISION MOMENTA(*) + DOUBLE PRECISION GS(*) + INTEGER*4 IFLAV_VEC(*) + DOUBLE PRECISION RNDHEL(*) + DOUBLE PRECISION RNDCOL(*) + DOUBLE PRECISION MES(*) + INTEGER*4 SELHEL(*) + INTEGER*4 SELCOL(*) + LOGICAL HELONLY + END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL + END INTERFACE + +C +C Retrieve the number of good helicities for helicity filtering in the Bridge. 
+C - PBRIDGE: the memory address of the C++ Bridge +C - NGOODHEL: the output number of good helicities +C - NTOTHEL: the output total number of helicities in cudacpp (aka NCOMB in Fortran) +C + INTERFACE + SUBROUTINE FBRIDGEGETNGOODHEL(PBRIDGE, NGOODHEL, NTOTHEL) + INTEGER*8 PBRIDGE + INTEGER*4 NGOODHEL + INTEGER*4 NTOTHEL + END SUBROUTINE FBRIDGEGETNGOODHEL + END INTERFACE diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge_common.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge_common.inc new file mode 100644 index 0000000000..c1d74a5d1d --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge_common.inc @@ -0,0 +1,31 @@ +C Copyright (C) 2020-2024 CERN and UCLouvain. +C Licensed under the GNU Lesser General Public License (version 3 or later). +C Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin. +C Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +C +C Common block for internal communication between MadEvent components +C - FBRIDGE_MODE: the operation mode of MEs from Fortran and from the C++ Bridge +C + INTEGER*4 FBRIDGE_MODE ! (CppOnly=1, FortranOnly=0, BothQuiet=-1, BothDebug=-2) + COMMON/TO_FBRIDGE_MODE/FBRIDGE_MODE + +#ifdef MG5AMC_MEEXPORTER_CUDACPP +C +C Common block for internal communication between MadEvent components +C - FBRIDGE_*CBYF*: statistics for the CudaCpp by Fortran ratios of MEs +C + INTEGER*8 FBRIDGE_NCBYF1 ! number of entries for ME ratio-1 + DOUBLE PRECISION FBRIDGE_CBYF1SUM, FBRIDGE_CBYF1SUM2, ! sum/sum2/min/max ME ratio-1 + & FBRIDGE_CBYF1MIN, FBRIDGE_CBYF1MAX + COMMON/TO_FBRIDGE_CBYF1/FBRIDGE_NCBYF1, + & FBRIDGE_CBYF1SUM, FBRIDGE_CBYF1SUM2, + & FBRIDGE_CBYF1MIN, FBRIDGE_CBYF1MAX + +C +C Common block for internal communication between MadEvent components +C - FBRIDGE_PBRIDGE: the memory address of the C++ Bridge +C + INTEGER*8 FBRIDGE_PBRIDGE ! 
64bit memory address + COMMON/TO_FBRIDGE_PBRIDGE/FBRIDGE_PBRIDGE +#endif diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f new file mode 100644 index 0000000000..736d911387 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f @@ -0,0 +1,90 @@ +C Copyright (C) 2020-2024 CERN and UCLouvain. +C Licensed under the GNU Lesser General Public License (version 3 or later). +C Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. +C Further modified by: D. Massaro, A. Thete, A. Valassi (2022-2026) for the MG5aMC CUDACPP plugin. + + PROGRAM FCHECK_SA + IMPLICIT NONE + INCLUDE 'fsampler.inc' + INCLUDE 'fbridge.inc' + INTEGER*8 SAMPLER, BRIDGE ! 64bit memory addresses + INTEGER NEVTMAX, NEXTERNAL, NP4 + PARAMETER(NEVTMAX=2048*256, NEXTERNAL=%(nexternal)d, NP4=4) + CHARACTER*32 ARG0, ARG1, ARG2, ARG3 + INTEGER NARG1, NARG2, NARG3 + INTEGER NEVT, NITER + INTEGER IEVT, IITER +c INTEGER IEXTERNAL + DOUBLE PRECISION MOMENTA(0:NP4-1, NEXTERNAL, NEVTMAX) ! c-array momenta[nevt][nexternal][np4] + DOUBLE PRECISION GS(NEVTMAX) + INTEGER IFLAV_VEC(NEVTMAX) ! index of the flavor combination to calculate + DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used + DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used + DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 SELHEL(NEVTMAX) ! not yet used + INTEGER*4 SELCOL(NEVTMAX) ! not yet used + DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision + INTEGER NEVTOK ! exclude nan/abnormal MEs + + IFLAV_VEC(:) = 1 +C +C READ COMMAND LINE ARGUMENTS +C (NB: most errors will crash the program !) 
+C + IF ( COMMAND_ARGUMENT_COUNT() == 3 ) THEN + CALL GET_COMMAND_ARGUMENT(1,ARG1) + CALL GET_COMMAND_ARGUMENT(2,ARG2) + CALL GET_COMMAND_ARGUMENT(3,ARG3) + READ (ARG1,'(I4)') NARG1 + READ (ARG2,'(I4)') NARG2 + READ (ARG3,'(I4)') NARG3 + WRITE(6,*) "GPUBLOCKS= ", NARG1 + WRITE(6,*) "GPUTHREADS= ", NARG2 + WRITE(6,*) "NITERATIONS=", NARG3 + NEVT = NARG1 * NARG2 + NITER = NARG3 + IF ( NEVT > NEVTMAX ) THEN + WRITE(6,*) "ERROR! NEVT>NEVTMAX" + STOP + ENDIF + ELSE + CALL GET_COMMAND_ARGUMENT(0,ARG0) + WRITE(6,*) "Usage: ", TRIM(ARG0), + & " gpublocks gputhreads niterations" + STOP + ENDIF +C +C USE SAMPLER AND BRIDGE +C + NEVTOK = 0 + MES_SUM = 0 + CALL FBRIDGECREATE(BRIDGE, NEVT, NEXTERNAL, NP4) ! this must be at the beginning as it initialises the CUDA device + CALL FSAMPLERCREATE(SAMPLER, NEVT, NEXTERNAL, NP4) + DO IITER = 1, NITER + CALL FSAMPLERSEQUENCE(SAMPLER, MOMENTA) + DO IEVT = 1, NEVT + GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + END DO + CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 + & IFLAV_VEC, RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities + DO IEVT = 1, NEVT +c DO IEXTERNAL = 1, NEXTERNAL +c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, +c & MOMENTA(0, IEXTERNAL, IEVT), +c & MOMENTA(1, IEXTERNAL, IEVT), +c & MOMENTA(2, IEXTERNAL, IEVT), +c & MOMENTA(3, IEXTERNAL, IEVT) +c END DO +c WRITE(6,*) 'MES ', IEVT, MES(IEVT) +c WRITE(6,*) + IF ( .NOT. ISNAN(MES(IEVT)) ) THEN + NEVTOK = NEVTOK + 1 + MES_SUM = MES_SUM + MES(IEVT) + ENDIF + END DO + END DO + CALL FSAMPLERDELETE(SAMPLER) + CALL FBRIDGEDELETE(BRIDGE) ! 
this must be at the end as it shuts down the CUDA device + WRITE(6,*) 'Average Matrix Element:', MES_SUM/NEVT/NITER + WRITE(6,*) 'Abnormal MEs:', NEVT*NITER - NEVTOK + END PROGRAM FCHECK_SA diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc new file mode 100644 index 0000000000..833f96a180 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc @@ -0,0 +1,165 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#include "mgOnGpuConfig.h" + +#include "Bridge.h" +#include "CPPProcess.h" +#include "MemoryBuffers.h" +#include "RamboSamplingKernels.h" +#include "RandomNumberKernels.h" + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + template + class Sampler final : public CppObjectInFortran + { + public: + // Constructor + // @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran arrays + // @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) + // @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) 
+    Sampler( int nevtF, int nparF, int np4F );
+    // Destructor
+    virtual ~Sampler() {}
+    // Delete copy/move constructors and assignment operators
+    Sampler( const Sampler& ) = delete;
+    Sampler( Sampler&& ) = delete;
+    Sampler& operator=( const Sampler& ) = delete;
+    Sampler& operator=( Sampler&& ) = delete;
+    // Draw random numbers and convert them to momenta in C++, then transpose them to Fortran momenta
+    void samplerHostSequence( FORTRANFPTYPE* fortranMomenta );
+  private:
+    const int m_nevt; // The number of events in each iteration
+    int m_iiter;      // The iteration counter (for random number seeding)
+#ifndef MGONGPUCPP_GPUIMPL
+    HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
+    HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
+    HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
+#else
+    PinnedHostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
+    PinnedHostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
+    PinnedHostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
+#endif
+    std::unique_ptr<RandomNumberKernelBase> m_prnk; // The appropriate RandomNumberKernel
+    std::unique_ptr<SamplingKernelBase> m_prsk;     // The appropriate SamplingKernel
+    // HARDCODED DEFAULTS
+    static constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  };
+
+  template<typename FORTRANFPTYPE>
+  Sampler<FORTRANFPTYPE>::Sampler( int nevtF, int nparF, int np4F )
+    : m_nevt( nevtF )
+    , m_iiter( 0 )
+    , m_hstRndmom( nevtF )
+    , m_hstMomenta( nevtF )
+    , m_hstWeights( nevtF )
+    , m_prnk( new CommonRandomNumberKernel( m_hstRndmom ) )
+    , m_prsk( new RamboSamplingKernelHost( energy, m_hstRndmom, m_hstMomenta, m_hstWeights, nevtF ) )
+  {
+    if( nparF != CPPProcess::npar ) throw std::runtime_error( "Sampler constructor: npar mismatch" );
+    if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Sampler constructor: np4 mismatch" );
+    std::cout << "WARNING! Instantiate host Sampler (nevt=" << m_nevt << ")" << std::endl;
+  }
+
+  // Draw random numbers and convert them to momenta in C++, then transpose them to Fortran momenta
+  template<typename FORTRANFPTYPE>
+  void Sampler<FORTRANFPTYPE>::samplerHostSequence( FORTRANFPTYPE* fortranMomenta )
+  {
+    std::cout << "Iteration #" << m_iiter + 1 << std::endl;
+    // === STEP 1 OF 3
+    // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+    // [NB This should not be necessary using the host API: "Generation functions
+    // can be called multiple times on the same generator to generate successive
+    // blocks of results. For pseudorandom generators, multiple calls to generation
+    // functions will yield the same result as a single call with a large size."]
+    // *** NB! REMEMBER THAT THE FORTRAN SAMPLER ALWAYS USES COMMON RANDOM NUMBERS! ***
+    constexpr unsigned long long seed = 20200805;
+    m_prnk->seedGenerator( seed + m_iiter );
+    m_iiter++;
+    // --- 1b. Generate all relevant numbers to build nevt events (i.e. nevt phase space points) on the host
+    m_prnk->generateRnarray();
+    //std::cout << "Got random numbers" << std::endl;
+    // === STEP 2 OF 3
+    // --- 2a. Fill in momenta of initial state particles on the device
+    m_prsk->getMomentaInitial();
+    //std::cout << "Got initial momenta" << std::endl;
+    // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device
+    // (i.e. map random numbers to final-state particle momenta for each of nevt events)
+    m_prsk->getMomentaFinal();
+    //std::cout << "Got final momenta" << std::endl;
+    // --- 2c. TransposeC2F
+    hst_transposeMomentaC2F( m_hstMomenta.data(), fortranMomenta, m_nevt );
+  }
+}
+
+//--------------------------------------------------------------------------
+
+extern "C"
+{
+#ifdef MGONGPUCPP_GPUIMPL
+  using namespace mg5amcGpu;
+#else
+  using namespace mg5amcCpu;
+#endif
+
+  /**
+   * The floating point precision used in Fortran arrays.
+   * This is presently hardcoded to double precision (REAL*8).
+   */
+  using FORTRANFPTYPE = double; // for Fortran double precision (REAL*8) arrays
+  //using FORTRANFPTYPE = float; // for Fortran single precision (REAL*4) arrays
+
+  /**
+   * Create a Sampler and return its pointer.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppsampler the pointer to the Sampler pointer (the Sampler pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param nevtF the pointer to the number of events in the Fortran arrays
+   * @param nparF the pointer to the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY)
+   * @param np4F the pointer to the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY)
+   */
+  void fsamplercreate_( CppObjectInFortran** ppsampler, const int* pnevtF, const int* pnparF, const int* pnp4F )
+  {
+    *ppsampler = new Sampler<FORTRANFPTYPE>( *pnevtF, *pnparF, *pnp4F );
+  }
+
+  /**
+   * Delete a Sampler.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppsampler the pointer to the Sampler pointer (the Sampler pointer is handled in Fortran as an INTEGER*8 variable)
+   */
+  void fsamplerdelete_( CppObjectInFortran** ppsampler )
+  {
+    Sampler<FORTRANFPTYPE>* psampler = dynamic_cast<Sampler<FORTRANFPTYPE>*>( *ppsampler );
+    if( psampler == 0 ) throw std::runtime_error( "fsamplerdelete_: invalid Sampler address" );
+    delete psampler;
+  }
+
+  /**
+   * Execute the matrix-element calculation "sequence" via a Sampler on GPU/CUDA or CPU/C++.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppsampler the pointer to the Sampler pointer (the Sampler pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param momenta the pointer to the input 4-momenta
+   * @param mes the pointer to the output matrix elements
+   */
+  void fsamplersequence_( CppObjectInFortran** ppsampler, FORTRANFPTYPE* momenta )
+  {
+    Sampler<FORTRANFPTYPE>* psampler = dynamic_cast<Sampler<FORTRANFPTYPE>*>( *ppsampler );
+    if( psampler == 0 ) throw std::runtime_error( "fsamplersequence_: invalid Sampler address" );
+    // Use the host/CPU implementation (there is no device implementation)
+    psampler->samplerHostSequence( momenta );
+  }
+}
+
+//--------------------------------------------------------------------------
diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.inc
new file mode 100644
index 0000000000..2865ed5062
--- /dev/null
+++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.inc
@@ -0,0 +1,42 @@
+C Copyright (C) 2020-2024 CERN and UCLouvain.
+C Licensed under the GNU Lesser General Public License (version 3 or later).
+C Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
+C Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+
+C
+C Create a Sampler and return its pointer
+C - PSAMPLER: the memory address of the C++ Sampler
+C - NEVT: the number of events in the Fortran arrays
+C - NPAR: the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?)
+C - NP4: the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?)
+C
+      INTERFACE
+         SUBROUTINE FSAMPLERCREATE(PSAMPLER, NEVT, NPAR, NP4)
+         INTEGER*8 PSAMPLER
+         INTEGER*4 NEVT
+         INTEGER*4 NPAR
+         INTEGER*4 NP4
+         END SUBROUTINE FSAMPLERCREATE
+      END INTERFACE
+
+C
+C Delete a Sampler.
+C - PSAMPLER: the memory address of the C++ Sampler +C + INTERFACE + SUBROUTINE FSAMPLERDELETE(PSAMPLER) + INTEGER*8 PSAMPLER + END SUBROUTINE FSAMPLERDELETE + END INTERFACE + +C +C Execute the matrix-element calculation "sequence" via a Sampler on GPU/CUDA or CUDA/C++. +C - PSAMPLER: the memory address of the C++ Sampler +C - MOMENTA: the output 4-momenta Fortran array +C + INTERFACE + SUBROUTINE FSAMPLERSEQUENCE(PSAMPLER, MOMENTA) + INTEGER*8 PSAMPLER + DOUBLE PRECISION MOMENTA(*) + END SUBROUTINE FSAMPLERSEQUENCE + END INTERFACE diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/makefile_wrapper.mk b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/makefile_wrapper.mk new file mode 100644 index 0000000000..59c862b17f --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/makefile_wrapper.mk @@ -0,0 +1,3 @@ +SHELL := /bin/bash +include makefile_original.mk +include cudacpp_overlay.mk diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h new file mode 100644 index 0000000000..c32d0a2740 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -0,0 +1,297 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUCONFIG_H +#define MGONGPUCONFIG_H 1 + +// HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) +// There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) +%(mgongpu_supports_multichannel)s + +// Is this a GPU (CUDA, HIP) or CPU implementation? 
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + +// ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" +// ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) + +// Choose if curand is supported for generating random numbers +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) +// For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else +//#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +//#else +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +//#endif +#endif + +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) 
+#if defined __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + +// Choose floating point precision (for everything but color algebra #537) +// If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) +#if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT +// Floating point precision (CHOOSE ONLY ONE) +#define MGONGPU_FPTYPE_DOUBLE 1 // default +//#define MGONGPU_FPTYPE_FLOAT 1 // 2x faster +#endif + +// Choose floating point precision (for color algebra alone #537) +// If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE2_FLOAT, nothing happens (issue #167) +#if not defined MGONGPU_FPTYPE2_DOUBLE and not defined MGONGPU_FPTYPE2_FLOAT +// Floating point precision (CHOOSE ONLY ONE) +#define MGONGPU_FPTYPE2_DOUBLE 1 // default +//#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster +#endif + +// Choose whether to inline all HelAmps functions +// This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) +// By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS +//#undef MGONGPU_INLINE_HELAMPS // default +////#define MGONGPU_INLINE_HELAMPS 1 + +// Choose whether to hardcode the cIPD physics parameters rather than reading them from user cards +// This optimization can gain 20%% in CUDA in eemumu (issue #39) +// By default, do not hardcode, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HARDCODE_PARAM +// ** NB: The option to use hardcoded cIPD physics parameters is supported again even now when alphas is running (#373) +// ** NB: Note however that it now only refers to cIPD parameters (cIPC parameters are always accessed through global memory) +//#undef MGONGPU_HARDCODE_PARAM // default +////#define MGONGPU_HARDCODE_PARAM 1 + +/* clang-format off */ +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) +//#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) +//#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) 
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ +#endif /* clang-format on */ + +// Choose whether to enable or disable channelid debug printouts +#ifndef MGONGPU_SUPPORTS_MULTICHANNEL +#undef MGONGPU_CHANNELID_DEBUG // multichannel is not enabled +#else +// By default, do not hardcode, but allow this macro to be set from outside with e.g. -DMGONGPU_CHANNELID_DEBUG +//#undef MGONGPU_CHANNELID_DEBUG // default +////#define MGONGPU_CHANNELID_DEBUG 1 +#endif + +// SANITY CHECKS (floating point precision for everything but color algebra #537) +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE_FLOAT +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_FPTYPE_DOUBLE or defined MGONGPU_FPTYPE_FLOAT +#endif + +// SANITY CHECKS (floating point precision for color algebra alone #537) +#if defined MGONGPU_FPTYPE2_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_FPTYPE2_DOUBLE or defined MGONGPU_FPTYPE2_FLOAT +#endif +#if defined MGONGPU_FPTYPE2_DOUBLE and defined MGONGPU_FPTYPE_FLOAT +#error You cannot use double precision for color algebra and single precision elsewhere +#endif + +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of 
MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#endif +#endif + +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +#endif +#endif + +// NB: namespace mgOnGpu includes types which are defined in exactly the same way for CPU and GPU builds (see #318 and #725) +namespace mgOnGpu +{ + + // --- Type definitions + + // Floating point type (for everything but color algebra #537): fptype +#if defined MGONGPU_FPTYPE_DOUBLE + typedef double fptype; // double precision (8 bytes, fp64) +#elif defined MGONGPU_FPTYPE_FLOAT + typedef float fptype; // single precision (4 bytes, fp32) +#endif + + // Floating point type (for color algebra alone #537): fptype2 +#if defined MGONGPU_FPTYPE2_DOUBLE + typedef double fptype2; // double precision (8 bytes, fp64) +#elif defined MGONGPU_FPTYPE2_FLOAT + typedef float fptype2; // single precision (4 bytes, fp32) +#endif + + // --- Platform-specific software implementation details + + // Maximum number of blocks per grid + // ** NB Some arrays of pointers will be allocated statically to fit all these blocks + // ** (the actual memory for each block will then be allocated dynamically only for existing blocks) + //const int nbpgMAX = 2048; + + // Maximum number of threads per block + //const int ntpbMAX = 256; // AV Apr2021: why had I set this to 256? 
+ const int ntpbMAX = 1024; // NB: 512 is ok, but 1024 does fail with "too many resources requested for launch" + + // Alignment requirement for using reinterpret_cast with SIMD vectorized code + // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) + // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) +#ifndef MGONGPUCPP_GPUIMPL + constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) +#endif + +} + +// Expose typedefs and operators outside the namespace +using mgOnGpu::fptype; +using mgOnGpu::fptype2; + +// C++ SIMD vectorization width (this will be used to set neppV) +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#undef MGONGPU_CPPSIMD +#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 8 +#else +#define MGONGPU_CPPSIMD 16 +#endif +#elif defined __AVX512VL__ // C++ "512y" AVX512 with 256 width (256-bit ie 32-byte): 4 (DOUBLE) or 8 (FLOAT) [gcc DEFAULT] +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 4 +#else +#define MGONGPU_CPPSIMD 8 +#endif +#elif defined __AVX2__ // C++ "avx2" AVX2 (256-bit ie 32-byte): 4 (DOUBLE) or 8 (FLOAT) [clang DEFAULT] +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 4 +#else +#define MGONGPU_CPPSIMD 8 +#endif +#elif defined __SSE4_2__ // C++ "sse4" SSE4.2 (128-bit ie 16-byte): 2 (DOUBLE) or 4 (FLOAT) [Power9 and ARM default] +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 2 +#else +#define MGONGPU_CPPSIMD 4 +#endif +#else // C++ "none" i.e. no SIMD +#undef MGONGPU_CPPSIMD +#endif + +/* clang-format off */ +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
+// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%%f\n", mgDebugCounter[threadIdx.x] ); } +//#else +#define mgDebugDeclare() /*noop*/ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ + +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL +#define __global__ +#define __host__ +#define __device__ +#endif + +// For SANITY CHECKS: check that neppR, neppM, neppV... are powers of two (https://stackoverflow.com/a/108360) +inline constexpr bool +ispoweroftwo( int n ) +{ + return ( n > 0 ) && !( n & ( n - 1 ) ); +} + +// Compiler version support (#96): require nvcc from CUDA >= 11.2, e.g. to use C++17 (see #333) +#ifdef __NVCC__ +#if( __CUDACC_VER_MAJOR__ < 11 ) || ( __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 2 ) +#error Unsupported CUDA version: please use CUDA >= 11.2 +#endif +#endif + +// Compiler version support (#96): require clang >= 11 +#if defined __clang__ +#if( __clang_major__ < 11 ) +#error Unsupported clang version: please use clang >= 11 +#endif +// Compiler version support (#96): require gcc >= 9.3, e.g. for some OMP issues (see #269) +// [NB skip this check for the gcc toolchain below clang or icx (TEMPORARY? 
#355)] +#elif defined __GNUC__ +#if( __GNUC__ < 9 ) || ( __GNUC__ == 9 && __GNUC_MINOR__ < 3 ) +#error Unsupported gcc version: please gcc >= 9.3 +#endif +#endif + +#endif // MGONGPUCONFIG_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h new file mode 100644 index 0000000000..92d74fd6db --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -0,0 +1,744 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUCXTYPES_H +#define MGONGPUCXTYPES_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuFptypes.h" + +#include <iostream> + +//========================================================================== +// COMPLEX TYPES: (PLATFORM-SPECIFIC) HEADERS +//========================================================================== + +#include <cmath> + +// Complex type in cuda: thrust or cucomplex or cxsmpl +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#if defined MGONGPU_CUCXTYPE_THRUST +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) +#include <thrust/complex.h> +#pragma clang diagnostic pop +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX +#include <cuComplex.h> +#elif not defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +#endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif +#else +// Complex type in c++ or HIP: std::complex
or cxsmpl +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#include +#elif not defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +#endif +#endif + +//========================================================================== +// COMPLEX TYPES: INSTRUMENTED CUCOMPLEX CLASS (cucomplex) +//========================================================================== + +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX +namespace mg5amcGpu +{ +#if defined MGONGPU_FPTYPE_DOUBLE + class cucomplex + { + public: + __host__ __device__ cucomplex( const double& r = 0, const double& i = 0 ) + : m_ri( make_cuDoubleComplex( r, i ) ) {} + __host__ __device__ constexpr cucomplex( const cuDoubleComplex& ri ) + : m_ri( ri ) {} + //__host__ __device__ operator cuDoubleComplex&() { return m_ri; } + __host__ __device__ constexpr operator cuDoubleComplex() const { return m_ri; } + __host__ __device__ double real() const { return cuCreal( m_ri ); } + __host__ __device__ double imag() const { return cuCimag( m_ri ); } + inline __host__ __device__ cucomplex& operator+=( const cucomplex& c ) + { + m_ri = cuCadd( m_ri, c ); + return *this; + } + inline __host__ __device__ cucomplex& operator-=( const cucomplex& c ) + { + m_ri = cuCsub( m_ri, c ); + return *this; + } + private: + cuDoubleComplex m_ri; + }; +#elif defined MGONGPU_FPTYPE_FLOAT + class cucomplex + { + public: + __host__ __device__ cucomplex( const float& r = 0, const float& i = 0 ) + : m_ri( make_cuFloatComplex( r, i ) ) {} + __host__ __device__ constexpr cucomplex( const cuFloatComplex& ri ) + : m_ri( ri ) {} + //__host__ __device__ operator cuFloatComplex&() { return m_ri; } + __host__ __device__ constexpr operator cuFloatComplex() const { return m_ri; } + __host__ __device__ float real() const { return cuCrealf( m_ri ); } + __host__ __device__ float imag() const { return cuCimagf( m_ri ); } 
+ inline __host__ __device__ cucomplex& operator+=( const cucomplex& c ) + { + m_ri = cuCaddf( m_ri, c ); + return *this; + } + inline __host__ __device__ cucomplex& operator-=( const cucomplex& c ) + { + m_ri = cuCsubf( m_ri, c ); + return *this; + } + private: + cuFloatComplex m_ri; + }; +#endif +} +#endif +#endif + +//========================================================================== +// COMPLEX TYPES: SIMPLE COMPLEX CLASS (cxsmpl) +//========================================================================== + +// NB: namespace mgOnGpu includes types which are defined in exactly the same way for CPU and GPU builds (see #318 and #725) +namespace mgOnGpu /* clang-format off */ +{ + // The number of floating point types in a complex type (real, imaginary) + constexpr int nx2 = 2; + + // --- Type definition (simple complex type derived from cxtype_v) + template + class cxsmpl + { + public: + __host__ __device__ constexpr cxsmpl() : m_real( 0 ), m_imag( 0 ) {} + cxsmpl( const cxsmpl& ) = default; + cxsmpl( cxsmpl&& ) = default; + __host__ __device__ constexpr cxsmpl( const FP& r, const FP& i = 0 ) : m_real( r ), m_imag( i ) {} + __host__ __device__ constexpr cxsmpl( const std::complex& c ) : m_real( c.real() ), m_imag( c.imag() ) {} + cxsmpl& operator=( const cxsmpl& ) = default; + cxsmpl& operator=( cxsmpl&& ) = default; + __host__ __device__ constexpr cxsmpl& operator+=( const cxsmpl& c ) { m_real += c.real(); m_imag += c.imag(); return *this; } + __host__ __device__ constexpr cxsmpl& operator-=( const cxsmpl& c ) { m_real -= c.real(); m_imag -= c.imag(); return *this; } + __host__ __device__ constexpr const FP& real() const { return m_real; } + __host__ __device__ constexpr const FP& imag() const { return m_imag; } + template __host__ __device__ constexpr operator cxsmpl() const { return cxsmpl( m_real, m_imag ); } +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifdef MGONGPU_CUCXTYPE_THRUST + template __host__ __device__ constexpr 
operator thrust::complex() const { return thrust::complex( m_real, m_imag ); } +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX + __host__ __device__ constexpr operator mg5amcGpu::cucomplex() const { return mg5amcGpu::cucomplex( m_real, m_imag ); } +#endif +#else +#ifdef MGONGPU_CPPCXTYPE_STDCOMPLEX + template __host__ __device__ constexpr operator std::complex() const { return std::complex( m_real, m_imag ); } +#endif +#endif + private: + FP m_real, m_imag; // RI + }; + + template + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl + conj( const cxsmpl& c ) + { + return cxsmpl( c.real(), -c.imag() ); + } +} /* clang-format on */ + +// Expose the cxsmpl class outside the namespace +using mgOnGpu::cxsmpl; + +// Printout to stream for user defined types +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + template + inline __host__ std::ostream& + operator<<( std::ostream& out, const cxsmpl& c ) + { + //out << std::complex( c.real(), c.imag() ); + out << "(" << c.real() << ", " << c.imag() << ")"; // add a space after the comma + return out; + } + + // Operators for cxsmpl + template + inline __host__ __device__ constexpr cxsmpl + operator+( const cxsmpl a ) + { + return a; + } + + template + inline __host__ __device__ constexpr cxsmpl + operator-( const cxsmpl& a ) + { + return cxsmpl( -a.real(), -a.imag() ); + } + + template + inline __host__ __device__ constexpr cxsmpl + operator+( const cxsmpl& a, const cxsmpl& b ) + { + return cxsmpl( a.real() + b.real(), a.imag() + b.imag() ); + } + + template + inline __host__ __device__ constexpr cxsmpl + operator+( const FP& a, const cxsmpl& b ) + { + return cxsmpl( a, 0 ) + b; + } + + template + inline __host__ __device__ constexpr cxsmpl + operator-( const cxsmpl& a, const cxsmpl& b ) + { + return cxsmpl( a.real() - b.real(), a.imag() - b.imag() ); + } + + template + 
inline __host__ __device__ constexpr cxsmpl + operator-( const FP& a, const cxsmpl& b ) + { + return cxsmpl( a, 0 ) - b; + } + + template + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const cxsmpl& b ) + { + return cxsmpl( a.real() * b.real() - a.imag() * b.imag(), a.imag() * b.real() + a.real() * b.imag() ); + } + + template + inline __host__ __device__ constexpr cxsmpl + operator*( const FP& a, const cxsmpl& b ) + { + return cxsmpl( a, 0 ) * b; + } + + inline __host__ __device__ constexpr cxsmpl + operator*( const double& a, const cxsmpl& b ) + { + return cxsmpl( a, 0 ) * b; + } + + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { + return a * cxsmpl( b, 0 ); + } + + template + inline __host__ __device__ constexpr cxsmpl + operator/( const cxsmpl& a, const cxsmpl& b ) + { + FP bnorm = b.real() * b.real() + b.imag() * b.imag(); + return cxsmpl( ( a.real() * b.real() + a.imag() * b.imag() ) / bnorm, + ( a.imag() * b.real() - a.real() * b.imag() ) / bnorm ); + } + + template + inline __host__ __device__ constexpr cxsmpl + operator/( const FP& a, const cxsmpl& b ) + { + return cxsmpl( a, 0 ) / b; + } + + template + inline __host__ __device__ constexpr cxsmpl + operator+( const cxsmpl& a, const FP& b ) + { + return a + cxsmpl( b, 0 ); + } + + template + inline __host__ __device__ constexpr cxsmpl + operator-( const cxsmpl& a, const FP& b ) + { + return a - cxsmpl( b, 0 ); + } + + template + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const FP& b ) + { + return a * cxsmpl( b, 0 ); + } + + template + inline __host__ __device__ constexpr cxsmpl + operator/( const cxsmpl& a, const FP& b ) + { + return a / cxsmpl( b, 0 ); + } +} + +//========================================================================== +// COMPLEX TYPES: (PLATFORM-SPECIFIC) TYPEDEFS +//========================================================================== + +// NB: namespaces mg5amcGpu and 
mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // --- Type definitions (complex type: cxtype) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#if defined MGONGPU_CUCXTYPE_THRUST + typedef thrust::complex cxtype; +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX + typedef cucomplex cxtype; +#else + typedef cxsmpl cxtype; +#endif +#else // c++ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX + typedef std::complex cxtype; +#else + typedef cxsmpl cxtype; +#endif +#endif + + // SANITY CHECK: memory access may be based on casts of fptype[2] to cxtype (e.g. for wavefunctions) + static_assert( sizeof( cxtype ) == mgOnGpu::nx2 * sizeof( fptype ), "sizeof(cxtype) is not 2*sizeof(fptype)" ); +} + +// DANGEROUS! this was mixing different cxtype definitions for CPU and GPU builds (see #318 and #725) +// DO NOT expose typedefs and operators outside the namespace +//using mgOnGpu::cxtype; + +//========================================================================== +// COMPLEX TYPES: (PLATFORM-SPECIFIC) FUNCTIONS AND OPERATORS +//========================================================================== + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + + //------------------------------ + // CUDA or C++ - using cxsmpl + //------------------------------ + + inline __host__ __device__ cxtype + cxmake( const fptype& r, const fptype& i ) + { + return cxtype( r, i ); // cxsmpl constructor + } + + inline __host__ __device__ fptype + cxreal( const cxtype& c ) + { + return c.real(); // cxsmpl::real() + } + + inline __host__ __device__ fptype + cximag( 
const cxtype& c ) + { + return c.imag(); // cxsmpl::imag() + } + + inline __host__ __device__ cxtype + cxconj( const cxtype& c ) + { + return conj( c ); // conj( cxsmpl ) + } + + inline __host__ cxtype // NOT __device__ + cxmake( const std::complex& c ) // std::complex to cxsmpl (float-to-float or float-to-double) + { + return cxmake( c.real(), c.imag() ); + } + + inline __host__ cxtype // NOT __device__ + cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-float or double-to-double) + { + return cxmake( c.real(), c.imag() ); + } + +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + + //========================================================================== + +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) + + //------------------------------ + // CUDA - using thrust::complex + //------------------------------ + + inline __host__ __device__ cxtype + cxmake( const fptype& r, const fptype& i ) + { + return cxtype( r, i ); // thrust::complex constructor + } + + inline __host__ __device__ fptype + cxreal( const cxtype& c ) + { + return c.real(); // thrust::complex::real() + } + + inline __host__ __device__ fptype + cximag( const cxtype& c ) + { + return c.imag(); // thrust::complex::imag() + } + + inline __host__ __device__ cxtype + cxconj( const cxtype& c ) + { + return conj( c ); // conj( thrust::complex ) + } + + inline __host__ __device__ const cxtype& + cxmake( const cxtype& c ) + { + return c; + } + +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST + + //========================================================================== + +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) + + //------------------------------ + // CUDA - using cuComplex + //------------------------------ + +#if defined 
MGONGPU_FPTYPE_DOUBLE // cuda + cucomplex + double + + //+++++++++++++++++++++++++ + // cuDoubleComplex ONLY + //+++++++++++++++++++++++++ + + inline __host__ __device__ cxtype + cxmake( const fptype& r, const fptype& i ) + { + return make_cuDoubleComplex( r, i ); + } + + inline __host__ __device__ fptype + cxreal( const cxtype& c ) + { + return cuCreal( c ); // returns by value + } + + inline __host__ __device__ fptype + cximag( const cxtype& c ) + { + return cuCimag( c ); // returns by value + } + + inline __host__ __device__ cxtype + operator+( const cxtype& a, const cxtype& b ) + { + return cuCadd( a, b ); + } + + inline __host__ __device__ cxtype + operator-( const cxtype& a, const cxtype& b ) + { + return cuCsub( a, b ); + } + + inline __host__ __device__ cxtype + operator*( const cxtype& a, const cxtype& b ) + { + return cuCmul( a, b ); + } + + inline __host__ __device__ cxtype + operator/( const cxtype& a, const cxtype& b ) + { + return cuCdiv( a, b ); + } + + inline __host__ std::ostream& + operator<<( std::ostream& out, const cxtype& c ) + { + //out << std::complex( cxreal( c ), cximag( c ) ); + out << "(" << cxreal( c ) << ", " << cximag( c ) << ")"; // add a space after the comma + return out; + } + +#elif defined MGONGPU_FPTYPE_FLOAT // cuda + cucomplex + float + + //+++++++++++++++++++++++++ + // cuFloatComplex ONLY + //+++++++++++++++++++++++++ + + inline __host__ __device__ cxtype + cxmake( const fptype& r, const fptype& i ) + { + return make_cuFloatComplex( r, i ); + } + + inline __host__ __device__ fptype + cxreal( const cxtype& c ) + { + return cuCrealf( c ); // returns by value + } + + inline __host__ __device__ fptype + cximag( const cxtype& c ) + { + return cuCimagf( c ); // returns by value + } + + inline __host__ __device__ cxtype + operator+( const cxtype& a, const cxtype& b ) + { + return cuCaddf( a, b ); + } + + inline __host__ __device__ cxtype + operator-( const cxtype& a, const cxtype& b ) + { + return cuCsubf( a, b ); + } + + inline 
__host__ __device__ cxtype + operator*( const cxtype& a, const cxtype& b ) + { + return cuCmulf( a, b ); + } + + inline __host__ __device__ cxtype + operator/( const cxtype& a, const cxtype& b ) + { + return cuCdivf( a, b ); + } + + inline __host__ cxtype // NOT __device__ + cxmake( const std::complex& c ) // std::complex to cucomplex (cast double-to-float) + { + return cxmake( (fptype)c.real(), (fptype)c.imag() ); + } + + inline __host__ std::ostream& + operator<<( std::ostream& out, const cxtype& c ) + { + //out << std::complex( cxreal( c ), cximag( c ) ); + out << "(" << cxreal( c ) << ", " << cximag( c ) << ")"; // add a space after the comma + return out; + } + +#endif + + //+++++++++++++++++++++++++ + // cuDoubleComplex OR + // cuFloatComplex + //+++++++++++++++++++++++++ + + inline __host__ __device__ cxtype + operator+( const cxtype a ) + { + return a; + } + + inline __host__ __device__ cxtype + operator-( const cxtype& a ) + { + return cxmake( -cxreal( a ), -cximag( a ) ); + } + + inline __host__ __device__ cxtype + operator+( const fptype& a, const cxtype& b ) + { + return cxmake( a, 0 ) + b; + } + + inline __host__ __device__ cxtype + operator-( const fptype& a, const cxtype& b ) + { + return cxmake( a, 0 ) - b; + } + + inline __host__ __device__ cxtype + operator*( const fptype& a, const cxtype& b ) + { + return cxmake( a, 0 ) * b; + } + + inline __host__ __device__ cxtype + operator/( const fptype& a, const cxtype& b ) + { + return cxmake( a, 0 ) / b; + } + + inline __host__ __device__ cxtype + operator+( const cxtype& a, const fptype& b ) + { + return a + cxmake( b, 0 ); + } + + inline __host__ __device__ cxtype + operator-( const cxtype& a, const fptype& b ) + { + return a - cxmake( b, 0 ); + } + + inline __host__ __device__ cxtype + operator*( const cxtype& a, const fptype& b ) + { + return a * cxmake( b, 0 ); + } + + inline __host__ __device__ cxtype + operator/( const cxtype& a, const fptype& b ) + { + return a / cxmake( b, 0 ); + } + + inline 
__host__ __device__ cxtype + cxconj( const cxtype& c ) + { + return cxmake( cxreal( c ), -cximag( c ) ); + } + + inline __host__ cxtype // NOT __device__ + cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-float or double-to-double) + { + return cxmake( c.real(), c.imag() ); + } + +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX + + //========================================================================== + +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) + + //------------------------------ + // C++ - using std::complex + //------------------------------ + + inline cxtype + cxmake( const fptype& r, const fptype& i ) + { + return cxtype( r, i ); // std::complex constructor + } + + inline fptype + cxreal( const cxtype& c ) + { + return c.real(); // std::complex::real() + } + + inline fptype + cximag( const cxtype& c ) + { + return c.imag(); // std::complex::imag() + } + + inline cxtype + cxconj( const cxtype& c ) + { + return conj( c ); // conj( std::complex ) + } + + inline const cxtype& + cxmake( const cxtype& c ) // std::complex to std::complex (float-to-float or double-to-double) + { + return c; + } + +#if defined MGONGPU_FPTYPE_FLOAT + inline cxtype + cxmake( const std::complex& c ) // std::complex to std::complex (cast double-to-float) + { + return cxmake( (fptype)c.real(), (fptype)c.imag() ); + } +#endif + +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX + + //========================================================================== + + inline __host__ __device__ const cxtype + cxmake( const cxsmpl& c ) // cxsmpl to cxtype (float-to-float or float-to-double) + { + return cxmake( c.real(), c.imag() ); + } + + inline __host__ __device__ const cxtype + cxmake( const cxsmpl& c ) // cxsmpl to cxtype (double-to-float or double-to-double) + { + return cxmake( c.real(), c.imag() ); + } + +} // 
end namespace mg5amcGpu/mg5amcCpu + +//========================================================================== +// COMPLEX TYPES: WRAPPER OVER RI FLOATING POINT PAIR (cxtype_ref) +//========================================================================== + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // The cxtype_ref class (a const reference to two non-const fp variables) was originally designed for cxtype_v::operator[] + // It used to be included in the code only when MGONGPU_HAS_CPPCXTYPEV_BRK (originally MGONGPU_HAS_CPPCXTYPE_REF) is defined + // It is now always included in the code because it is needed also to access an fptype wavefunction buffer as a cxtype + class cxtype_ref + { + public: + cxtype_ref() = delete; + cxtype_ref( const cxtype_ref& ) = delete; + cxtype_ref( cxtype_ref&& ) = default; // copy const refs + __host__ __device__ cxtype_ref( fptype& r, fptype& i ) + : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs + cxtype_ref& operator=( const cxtype_ref& ) = delete; + //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + { + *m_preal = cxreal( c ); + *m_pimag = cximag( c ); + return *this; + } // copy (assign) non-const values + __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } + private: + fptype* const m_preal; // const pointer to non-const fptype R + fptype* const m_pimag; // const pointer to non-const fptype I + }; + + // Printout to stream for user defined types + inline __host__ __device__ std::ostream& + operator<<( std::ostream& out, const cxtype_ref& c ) + { + out << (cxtype)c; + return out; + } + +} // end namespace mg5amcGpu/mg5amcCpu + +//========================================================================== + +#endif // MGONGPUCXTYPES_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h new file mode 100644 index 0000000000..960beeeeae --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -0,0 +1,101 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUFPTYPES_H +#define MGONGPUFPTYPES_H 1 + +#include "mgOnGpuConfig.h" + +#include +#include + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL // cuda +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //========================================================================== + +#ifdef MGONGPUCPP_GPUIMPL // cuda + + //------------------------------ + // Floating point types - Cuda + //------------------------------ + + /* + inline __host__ __device__ fptype + fpmax( const fptype& a, const fptype& b ) + { + return max( a, b ); + } + + inline __host__ __device__ fptype + fpmin( const fptype& a, const fptype& b ) + { + return min( a, b ); + } + */ + + inline __host__ __device__ const fptype& + fpmax( const fptype& a, const fptype& b ) + { + return ( ( b < a ) ? a : b ); + } + + inline __host__ __device__ const fptype& + fpmin( const fptype& a, const fptype& b ) + { + return ( ( a < b ) ? 
a : b ); + } + + inline __host__ __device__ fptype + fpsqrt( const fptype& f ) + { +#if defined MGONGPU_FPTYPE_FLOAT + // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html + return sqrtf( f ); +#else + // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__DOUBLE.html + return sqrt( f ); +#endif + } + +#endif // #ifdef MGONGPUCPP_GPUIMPL + + //========================================================================== + +#ifndef MGONGPUCPP_GPUIMPL + + //------------------------------ + // Floating point types - C++ + //------------------------------ + + inline const fptype& + fpmax( const fptype& a, const fptype& b ) + { + return std::max( a, b ); + } + + inline const fptype& + fpmin( const fptype& a, const fptype& b ) + { + return std::min( a, b ); + } + + inline fptype + fpsqrt( const fptype& f ) + { + return std::sqrt( f ); + } + +#endif // #ifndef MGONGPUCPP_GPUIMPL + + //========================================================================== + +} // end namespace mg5amcGpu/mg5amcCpu + +#endif // MGONGPUFPTYPES_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h new file mode 100644 index 0000000000..9f3533a875 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -0,0 +1,931 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORS_H +#define MGONGPUVECTORS_H 1 + +#include "mgOnGpuCxtypes.h" +#include "mgOnGpuFptypes.h" + +#include + +//========================================================================== + +//------------------------------ +// Vector types - C++ +//------------------------------ + +#ifdef __clang__ +// If set: return a pair of (fptype&, fptype&) by non-const reference in cxtype_v::operator[] +// This is forbidden in clang ("non-const reference cannot bind to vector element") +// See also https://stackoverflow.com/questions/26554829 +//#define MGONGPU_HAS_CPPCXTYPEV_BRK 1 // clang test (compilation fails also on clang 12.0, issue #182) +#undef MGONGPU_HAS_CPPCXTYPEV_BRK // clang default +#elif defined __INTEL_COMPILER +//#define MGONGPU_HAS_CPPCXTYPEV_BRK 1 // icc default? +#undef MGONGPU_HAS_CPPCXTYPEV_BRK // icc test +#else +#define MGONGPU_HAS_CPPCXTYPEV_BRK 1 // gcc default +//#undef MGONGPU_HAS_CPPCXTYPEV_BRK // gcc test (very slightly slower? issue #172) +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +#ifdef MGONGPU_CPPSIMD + + const int neppV = MGONGPU_CPPSIMD; + + // SANITY CHECK: cppAlign must be a multiple of neppV * sizeof(fptype) + static_assert( mgOnGpu::cppAlign % ( neppV * sizeof( fptype ) ) == 0 ); + + // SANITY CHECK: check that neppV is a power of two + static_assert( ispoweroftwo( neppV ), "neppV is not a power of 2" ); + + // --- Type definition (using vector compiler extensions: need -march=...) 
+ // For gcc: https://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html + // For clang: https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors +#ifdef __clang__ + typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR +#else + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR +#endif + + // Mixed fptypes #537: float for color algebra and double elsewhere +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int neppV2 = MGONGPU_CPPSIMD * 2; + static_assert( mgOnGpu::cppAlign % ( neppV2 * sizeof( fptype2 ) ) == 0 ); + static_assert( ispoweroftwo( neppV2 ), "neppV2 is not a power of 2" ); +#ifdef __clang__ + typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR +#else + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR +#endif +#else + typedef fptype_v fptype2_v; +#endif + + // --- Type definition (using vector compiler extensions: need -march=...) 
+ class cxtype_v // no need for "class alignas(2*sizeof(fptype_v)) cxtype_v" + { + public: + // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) + // See https://en.cppreference.com/w/c/language/array_initialization#Notes + cxtype_v() + : m_real{ 0 }, m_imag{ 0 } {} // RRRR=0000 IIII=0000 + cxtype_v( const cxtype_v& ) = default; + cxtype_v( cxtype_v&& ) = default; + cxtype_v( const fptype_v& r, const fptype_v& i ) + : m_real( r ), m_imag( i ) {} + cxtype_v( const fptype_v& r ) + : m_real( r ), m_imag{ 0 } {} // IIII=0000 + cxtype_v( const fptype& r ) + : m_real( fptype_v{} + r ), m_imag{ 0 } {} // IIII=0000 + cxtype_v& operator=( const cxtype_v& ) = default; + cxtype_v& operator=( cxtype_v&& ) = default; + cxtype_v& operator+=( const cxtype_v& c ) + { + m_real += c.real(); + m_imag += c.imag(); + return *this; + } + cxtype_v& operator-=( const cxtype_v& c ) + { + m_real -= c.real(); + m_imag -= c.imag(); + return *this; + } +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + // NB: THIS IS THE FUNDAMENTAL DIFFERENCE BETWEEN MGONGPU_HAS_CPPCXTYPEV_BRK DEFINED AND NOT DEFINED + // NB: the alternative "clang" implementation is simpler: it simply does not have any bracket operator[] + //cxtype_ref operator[]( size_t i ) const { return cxtype_ref( m_real[i], m_imag[i] ); } // gcc14.2 build fails #1004 + cxtype_ref operator[]( size_t i ) { return cxtype_ref( m_real[i], m_imag[i] ); } + cxtype operator[]( size_t i ) const { return cxtype( m_real[i], m_imag[i] ); } +#endif + const fptype_v& real() const + { + return m_real; + } + const fptype_v& imag() const { return m_imag; } + private: + fptype_v m_real, m_imag; // RRRRIIII + }; + + // --- Type definition (using vector compiler extensions: need -march=...) 
+#ifdef __clang__ // https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors + typedef unsigned int uint_v __attribute__( ( ext_vector_type( neppV ) ) ); +#if defined MGONGPU_FPTYPE_DOUBLE + typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb +#elif defined MGONGPU_FPTYPE_FLOAT + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb +#endif +#else // gcc + typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) ); +#if defined MGONGPU_FPTYPE_DOUBLE + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb +#elif defined MGONGPU_FPTYPE_FLOAT + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb +#endif +#endif + +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) + + const int neppV = 1; + +#endif // #ifdef MGONGPU_CPPSIMD +} + +//-------------------------------------------------------------------------- + +// DANGEROUS! this was mixing different cxtype definitions for CPU and GPU builds (see #318 and #725) +// DO NOT expose typedefs outside the namespace +//using mgOnGpu::neppV; +//#ifdef MGONGPU_CPPSIMD +//using mgOnGpu::fptype_v; +//using mgOnGpu::fptype2_v; +//using mgOnGpu::cxtype_v; +//using mgOnGpu::bool_v; +//#endif + +//========================================================================== + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +#ifndef MGONGPUCPP_GPUIMPL + + // Printout to stream for user defined types + +#ifndef MGONGPU_CPPCXTYPE_CXSMPL // operator<< for cxsmpl has already been defined! 
+ inline std::ostream& + operator<<( std::ostream& out, const cxtype& c ) + { + out << "[" << cxreal( c ) << "," << cximag( c ) << "]"; + //out << cxreal(c) << "+i" << cximag(c); + return out; + } +#endif + + /* +#ifdef MGONGPU_CPPSIMD + inline std::ostream& + operator<<( std::ostream& out, const bool_v& v ) + { + out << "{ " << v[0]; + for ( int i=1; i 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + + inline fptype_v + fpsqrt( const fptype_v& v ) + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) out[i] = fpsqrt( v[i] ); + return out; + } +#endif + + /* +#ifdef MGONGPU_CPPSIMD + inline fptype_v + fpvmake( const fptype v[neppV] ) + { + fptype_v out = {}; // see #594 + for ( int i=0; i +// With a few modifications by Andrea Valassi + +//------------------------------------------- +// NVTX is enabled +//------------------------------------------- + +#ifdef USE_NVTX + +#include + +// This assumes CUDA 10.0+ +#include "nvtx3/nvToolsExt.h" + +// Scope some things into a namespace +namespace nvtx +{ + + // Colour palette (RGB): https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=12 + const uint32_t palette[] = { 0xffa6cee3, 0xff1f78b4, 0xffb2df8a, 0xff33a02c, 0xfffb9a99, 0xffe31a1c, 0xfffdbf6f, 0xffff7f00, 0xffcab2d6, 0xff6a3d9a, 0xffffff99, 0xffb15928 }; + const uint32_t colourCount = sizeof( palette ) / sizeof( uint32_t ); + + // Inline method to push an nvtx range + inline void push( const char* str, const uint32_t nextColourIdx ) + { + // Get the wrapped colour index + uint32_t colourIdx = nextColourIdx % colourCount; + // Build/populate the struct of nvtx event attributes + nvtxEventAttributes_t eventAttrib = { 0 }; // zero-out the struct (see https://nvidia.github.io/NVTX/doxygen/structnvtx_event_attributes__v2.html) + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = 
NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.color = palette[colourIdx]; + eventAttrib.message.ascii = str; + // Push the custom event. + nvtxRangePushEx( &eventAttrib ); + } + + // Inline method to pop an nvtx range + inline void pop() + { + nvtxRangePop(); + } + +} + +// Macro to push an arbitrary nvtx marker +#define NVTX_PUSH( str, idx ) nvtx::push( str, idx ) + +// Macro to pop an arbitrary nvtx marker +#define NVTX_POP() nvtx::pop() + +//------------------------------------------- +// NVTX is not enabled +//------------------------------------------- + +#else + +#define NVTX_PUSH( str, idx ) +#define NVTX_POP() + +#endif + +#endif // MGONGPUNVTX_H 1 diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.cc new file mode 100644 index 0000000000..f2144d8fc6 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.cc @@ -0,0 +1,25 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+ +#include + +// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code +// Hence the trailing "_": 'call xxx()' links to xxx_ +// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html + +// NB2: This file also contains C++ code and is built using g++ +// Hence use 'extern "C"' to avoid name mangling by the C++ compiler +// See https://www.geeksforgeeks.org/extern-c-in-c + +#ifdef _OPENMP +extern "C" +{ + void ompnumthreads_not_set_means_one_thread_() + { + const int debuglevel = 0; // quiet(-1), info(0), debug(1) + ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file + } +} +#endif diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.h new file mode 100644 index 0000000000..ac8bad4d48 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.h @@ -0,0 +1,63 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
#ifndef OMPNUMTHREADS_H
#define OMPNUMTHREADS_H 1

#ifdef _OPENMP

#include <omp.h>

#include <cstdlib>  // std::getenv, std::atol
#include <iostream>
#include <string>

// The OMP_NUM_THREADS environment variable is used to control OMP multi-threading.
// By default, all available $(nproc) threads are used if OMP_NUM_THREADS is not set:
// if ompnumthreadsNotSetMeansOneThread is called, only one thread is used instead.
// Parameter debuglevel: quiet(-1), info(0), debug(1).
// Side effect: calls omp_set_num_threads(1) when OMP_NUM_THREADS is unset or invalid
// (empty, non-numeric, or numerically zero); otherwise leaves the OMP setting untouched.
inline void
ompnumthreadsNotSetMeansOneThread( int debuglevel ) // quiet(-1), info(0), debug(1)
{
  // Set OMP_NUM_THREADS equal to 1 if it is not yet set
  const char* ompnthr = std::getenv( "OMP_NUM_THREADS" ); // nullptr when unset
  if( debuglevel == 1 )
  {
    std::cout << "DEBUG: entering ompnumthreadsNotSetMeansOneThread" << std::endl;
    std::cout << "DEBUG: omp_get_num_threads() = "
              << omp_get_num_threads() << std::endl; // always == 1 here (outside any parallel region)
    std::cout << "DEBUG: omp_get_max_threads() = "
              << omp_get_max_threads() << std::endl;
    std::cout << "DEBUG: ${OMP_NUM_THREADS} = '"
              << ( ompnthr == nullptr ? "[not set]" : ompnthr ) << "'" << std::endl;
  }
  // Invalid means: unset, containing any non-digit character, or parsing to zero
  // (note: for an empty string find_first_not_of returns npos, but atol("") == 0 catches it)
  if( ompnthr == nullptr ||
      std::string( ompnthr ).find_first_not_of( "0123456789" ) != std::string::npos ||
      std::atol( ompnthr ) == 0 )
  {
    if( ompnthr != nullptr )
      std::cout << "(ompnumthreadsNotSetMeansOneThread) "
                << "WARNING! OMP_NUM_THREADS is invalid: will use only 1 thread" << std::endl;
    else if( debuglevel >= 0 )
      std::cout << "(ompnumthreadsNotSetMeansOneThread) "
                << "DEBUG: OMP_NUM_THREADS is not set: will use only 1 thread" << std::endl;
    omp_set_num_threads( 1 ); // https://stackoverflow.com/a/22816325
    if( debuglevel == 1 )
    {
      std::cout << "DEBUG: omp_get_num_threads() = "
                << omp_get_num_threads() << std::endl; // always == 1 here!
      std::cout << "DEBUG: omp_get_max_threads() = "
                << omp_get_max_threads() << std::endl;
    }
  }
  else if( debuglevel >= 0 )
    std::cout << "(ompnumthreadsNotSetMeansOneThread) "
              << "DEBUG: OMP_NUM_THREADS = " << ompnthr << std::endl;
  if( debuglevel >= 0 )
    std::cout << "(ompnumthreadsNotSetMeansOneThread) "
              << "omp_get_max_threads() = " << omp_get_max_threads() << std::endl;
  if( debuglevel == 1 )
    std::cout << "DEBUG: exiting ompnumthreadsNotSetMeansOneThread" << std::endl;
}
#endif

#endif // OMPNUMTHREADS_H
from optparse import OptionParser
from datetime import datetime
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import ScalarFormatter
import numpy as np
import sys
import json
from operator import itemgetter


class Perf():
    """Load one perf-test JSON file and plot it (2D scatter, 3D bars, or stacked bars)."""

    def __init__(self, date, run, x, y, z, xrem, yrem, loc):
        """Read '<loc>/<date>-perf-test-run<run>.json' and store axis names and trims.

        x/y/z are the JSON key names used as plot axes; xrem/yrem remove that
        many outer bands from the x and y axes.
        """
        perffile = '%s/%s-perf-test-run%s.json' % (loc, date, run)
        # context manager closes the file even if JSON parsing raises
        with open(perffile, 'r') as data:
            readJson = json.loads(data.read())
        self.axesn = [x, y, z]
        self.axesr = [xrem, yrem]  # remove outer bands from axes
        self.axesv = [[], [], []]
        self.data = self.prepData(readJson)

    def prepData(self, jsonData):
        """Strip 'sec'/'GeV' unit suffixes from string values, converting them to float in place."""
        for data in jsonData:
            for i in data:
                if isinstance(data[i], str):
                    idx = -1
                    if data[i].find("sec") != -1:
                        idx = data[i].find("sec")
                    # FIX: the guard used to test find("GEV") while the slice used
                    # find("GeV"); str.find is case-sensitive, so "GeV" values were
                    # never converted to float
                    elif data[i].find("GeV") != -1:
                        idx = data[i].find("GeV")
                    if idx != -1:
                        data[i] = float(data[i][:idx - 1])
        return jsonData

    def prepAxes3D(self):
        """Collect the sorted unique values of each axis key into self.axesv, applying the trims."""
        for d in self.data:
            ks = list(d.keys())
            for ax in self.axesn:
                idx = self.axesn.index(ax)
                axlist = self.axesv[idx]
                if ax in ks:
                    axval = d[ax]
                    if axval not in axlist:
                        axlist.append(axval)
                else:
                    print('Error: cannot find axes name %s in %s' % (ax, d))
        # sanity check: one z value is expected per (x, y) combination
        if len(self.axesv[0]) * len(self.axesv[1]) != len(self.axesv[2]):
            print('Error: axes don\'t match x * y != z (%d * %d != %d' %
                  (len(self.axesv[0]), len(self.axesv[1]), len(self.axesv[2])))
        self.axesv[0].sort()
        self.axesv[1].sort()
        self.axesv[0] = self.axesv[0][self.axesr[0]:]  # sr
        self.axesv[1] = self.axesv[1][self.axesr[1]:]  # sr

    def prepData3D(self):
        """Build self.data2d, a len(x) x len(y) grid of z values indexed by axis positions."""
        xlen = len(self.axesv[0])
        ylen = len(self.axesv[1])
        # fresh row per x value (was deepcopy of a template row of ints)
        self.data2d = [[0] * ylen for _ in range(xlen)]
        for d in self.data:
            xpos = -1
            ypos = -1
            if d[self.axesn[0]] in self.axesv[0]:
                xpos = self.axesv[0].index(d[self.axesn[0]])
            if d[self.axesn[1]] in self.axesv[1]:
                ypos = self.axesv[1].index(d[self.axesn[1]])
            if xpos != -1 and ypos != -1:
                zval = d[self.axesn[2]]
                self.data2d[xpos][ypos] = zval

    def plot3D(self):
        """Show a 3D bar plot of z over the (x, y) grid."""
        self.prepAxes3D()
        self.prepData3D()

        data_array = np.array(self.data2d)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        x_data, y_data = np.meshgrid(np.arange(data_array.shape[1]),
                                     np.arange(data_array.shape[0]))
        xticks = x_data[0]
        yticks = np.array(list(range(len(y_data))))
        x_data = x_data.flatten()
        y_data = y_data.flatten()
        z_data = data_array.flatten()
        ax.set_xlabel(self.axesn[1], {'fontsize': 'small'})
        ax.set_xticks(xticks)
        # consider 'fontsize': 'small' for dict also yticklabels
        ax.set_xticklabels(self.axesv[1], {'rotation': 45, 'fontsize': 'small'})
        ax.set_ylabel(self.axesn[0], {'fontsize': 'small'})
        ax.set_yticks(yticks)
        # consider 'fontsize': 'small' for dict
        ax.set_yticklabels(self.axesv[0], {'rotation': 45, 'fontsize': 'small'})
        ax.set_zlabel(self.axesn[2], {'fontsize': 'small'})
        ax.bar3d(x_data, y_data, np.zeros(len(z_data)), 1, 1, z_data)
        plt.show()

    def prepData2D(self):
        """Group z values by x*y product into self.dataDict2D, tagged with an 'x/y' tick label."""
        self.dataDict2D = {}
        xname = self.axesn[0]
        yname = self.axesn[1]
        zname = self.axesn[2]

        for d in self.data:
            xval = d[xname]
            yval = d[yname]
            zval = d[zname]
            dim = xval * yval
            tick = '%s/%s' % (str(xval), str(yval))
            vallist = [float(str(zval).split()[0]), tick]
            # setdefault replaces the explicit membership test + two branches
            self.dataDict2D.setdefault(dim, []).append(vallist)

    def plot2D(self):
        """Show a log-scale scatter of z against the x*y product, colour/size-coded by x."""
        self.prepData2D()

        # use this value to plot a flat line for the cpu values to compare with
        cpuval = 0
        # cpuval = 79766.84 # tot
        # cpuval = 427251.1 # rmb + me
        # cpuval = 472578.7 # me

        # colour and marker size per number of threads per block
        cmap = {'32': 'red', '64': 'orange', '128': 'blue', '256': 'green'}
        smap = {'32': 20, '64': 40, '128': 80, '256': 160}

        dims = sorted(self.dataDict2D.keys())
        xlist = list(range(1, len(dims) + 1))
        ylist = []
        clist = []
        slist = []
        ylabels = []
        for d in dims:
            ysublist = list(self.dataDict2D[d])
            ysublist = sorted(ysublist, key=itemgetter(0), reverse=True)
            clist.append([cmap[x[1].split('/')[0]] for x in ysublist])
            slist.append([smap[x[1].split('/')[0]] for x in ysublist])
            ylabels.append([x[1] for x in ysublist])
            ylist.append([x[0] for x in ysublist])

        fig, ax = plt.subplots()
        print(xlist)
        print(ylist)
        for xe, ye, ce, se in zip(xlist, ylist, clist, slist):
            print([xe] * len(ye))
            ax.scatter([xe] * len(ye), ye, s=se, facecolors='none',
                       edgecolors=ce)
            if cpuval:
                ax.scatter(xe, cpuval, marker='+', c='dimgrey')

        ax.set_xticks(xlist)
        ax.set_xlabel('%s * %s' % (self.axesn[0], self.axesn[1]))
        ax.set_ylabel('%s' % (self.axesn[2]))
        ax.set_yscale('log')
        ax.set_xticklabels(dims, rotation=45)
        ax.yaxis.set_major_formatter(ScalarFormatter())
        plt.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))

        handlelist = []
        for k in cmap:
            handlelist.append(plt.scatter([], [], s=smap[k], marker='o',
                                          color=cmap[k], facecolor='none'))

        print(handlelist)
        plt.legend(handlelist, [str(x) for x in cmap.keys()],
                   title="# threads / block")

        plt.show()

    def plotStack(self, threads=32):
        """Show stacked timing bars per NumBlocksPerGrid, for entries matching the given thread count.

        Only JSON keys whose first character is a digit are stacked (they are
        assumed to be the per-section timing keys of the perf-test output).
        """
        collist = ['Purples', 'Blues', 'Greens', 'Oranges', 'Reds', 'Greys']

        bars = {}
        blocks = []
        for d in self.data:
            if d['NumThreadsPerBlock'] == threads:
                blocks.append(d['NumBlocksPerGrid'])
                for k in d:
                    if k[0].isdigit():
                        if k not in bars:
                            bars[k] = []

        barks = sorted(bars.keys())
        blocks.sort()

        for d in self.data:
            if d['NumThreadsPerBlock'] == threads:
                for b in barks:
                    if b in d:
                        bars[b].append(d[b])
                    else:
                        bars[b].append(0)

        ind = np.arange(len(bars[barks[0]]))
        width = 0.35

        plts = []
        ci = -1
        cj = 0.5
        plts.append(plt.bar(ind, bars[barks[0]], width, edgecolor='black',
                            color='white'))
        bot = [0] * len(bars[barks[0]])
        for i in range(1, len(barks)):
            # key prefix like '1a', '1b', ... selects colormap family and shade
            colcod = barks[i][:2]
            if colcod[1] == 'a':
                ci += 1
                cj = 0.5
            else:
                cj += 0.1
            print(colcod, ci, cj, bot[-1], barks[i])
            col = cm.get_cmap(collist[ci])(cj)
            # accumulate the running bottom of the stack
            sumlist = []
            for (l1, l2) in zip(bot, bars[barks[i - 1]]):
                sumlist.append(l1 + l2)
            bot = sumlist
            plts.append(plt.bar(ind, bars[barks[i]], width,
                                bottom=bot, color=col, edgecolor=col))

        plt.ylabel('seconds')
        plts.reverse()
        barks.reverse()
        plt.xticks(ind, [str(x) for x in blocks], rotation=45)
        plt.legend([x[0] for x in plts], barks)

        plt.show()


def print_keys(loc, date, run):
    """Print the JSON keys available in the first record of the perf-test file."""
    perffile = '%s/%s-perf-test-run%s.json' % (loc, date, run)
    with open(perffile, 'r') as data:
        readJson = json.loads(data.read())
    for k in list(readJson[0].keys()):
        print(k)


if __name__ == '__main__':

    n = datetime.now()
    today = str(n.year) + str(n.month).rjust(2, '0') + str(n.day).rjust(2, '0')
    parser = OptionParser()
    parser.add_option('-l', '--location', dest='dir', default='data',
                      help='directory with data (default: data)')
    parser.add_option('-d', '--date', dest='date', default=today,
                      help='date of data files YYYYMMDD (default: today)')
    parser.add_option('-r', '--run', default='1', dest='run',
                      help='run number (default: 1)')
    parser.add_option('-x', dest='xax', default='NumThreadsPerBlock',
                      help='variable name for x axis \
                      (default: NumThreadsPerBlock)')
    parser.add_option('-y', dest='yax', default='NumBlocksPerGrid',
                      help='variable name for y axis \
                      (default: NumBlocksPerGrid)')
    parser.add_option('-z', dest='zax', default='TotalTimeInWaveFuncs',
                      help='variable name for z axis \
                      (default: TotalTimeInWaveFuncs)')
    parser.add_option('--xrm', dest='xrm', default=0,
                      help='# of outer x dimensions to remove')
    parser.add_option('--yrm', dest='yrm', default=0,
                      help='# of outer y dimensions to remove')
    parser.add_option('-k', '--keys', dest='keys', action='store_true',
                      help='print available keys from data')

    (op, ar) = parser.parse_args()

    plotnames = ['2D', '3D', 'STACK']
    plot = '2D'

    xrm = 0
    yrm = 0
    if op.xrm:
        xrm = int(op.xrm)
    if op.yrm:
        yrm = int(op.yrm)

    if op.keys:
        print_keys(op.dir, op.date, op.run)
        sys.exit(0)

    if (len(ar) == 1 and ar[0].upper() not in plotnames) or len(ar) > 1:
        # FIX: print_help() writes to stdout itself and returns None;
        # wrapping it in print() emitted a spurious 'None' line
        parser.print_help()
        sys.exit(1)
    elif len(ar) == 1:
        plot = ar[0].upper()

    p = Perf(op.date, op.run, op.xax, op.yax, op.zax, xrm, yrm, op.dir)
    if plot == '3D':
        p.plot3D()
    if plot == '2D':
        p.plot2D()
    if plot == 'STACK':
        p.plotStack()
--- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_%(processid_uppercase)s_H +#define MG5_CONFIG_%(processid_uppercase)s_H 1 + +namespace processConfig { + + constexpr int ndiagrams = %(ndiagrams)d; + +} + +#endif // MG5_CONFIG_%(processid_uppercase)s_H \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc new file mode 100644 index 0000000000..a52e12fc4c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -0,0 +1,92 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. 
+//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +%(info_lines)s +//========================================================================== + +#include "CPPProcess.h" + +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +%(hel_amps_h)s +#include "MemoryAccessAmplitudes.h" +#include "MemoryAccessIflavorVec.h" +#include "MemoryAccessChannelIds.h" +#include "MemoryAccessCouplings.h" +#include "MemoryAccessCouplingsFixed.h" +#include "MemoryAccessGs.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessWavefunctions.h" +#include "color_sum.h" +#include "processConfig.h" + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#include "MemoryAccessDenominators.h" +#include "MemoryAccessNumerators.h" +#include "coloramps.h" +#endif + +#include +#include +#include // for feenableexcept, fegetexcept and FE_XXX +#include // for FLT_MIN +#include +#include +#include +#include + +// Test ncu metrics for CUDA thread divergence +#undef MGONGPU_TEST_DIVERGENCE +//#define MGONGPU_TEST_DIVERGENCE 1 + +//-------------------------------------------------------------------------- + +// Enable FPE traps (see #701, #733, #831 - except on MacOS where feenableexcept is not defined #730) +// [NB1: Fortran default is -ffpe-trap=none, i.e. FPE traps are not enabled, https://gcc.gnu.org/onlinedocs/gfortran/Debugging-Options.html] +// [NB2: Fortran default is -ffpe-summary=invalid,zero,overflow,underflow,denormal, i.e. 
warn at the end on STOP] +inline void +fpeEnable() +{ + static bool first = true; // FIXME: quick and dirty hack to do this only once (can be removed when separate C++/CUDA builds are implemented) + if( !first ) return; + first = false; +#ifndef __APPLE__ // on MacOS feenableexcept is not defined #730 + //int fpes = fegetexcept(); + //std::cout << "fpeEnable: analyse fegetexcept()=" << fpes << std::endl; + //std::cout << "fpeEnable: FE_DIVBYZERO is" << ( ( fpes & FE_DIVBYZERO ) ? " " : " NOT " ) << "enabled" << std::endl; + //std::cout << "fpeEnable: FE_INEXACT is" << ( ( fpes & FE_INEXACT ) ? " " : " NOT " ) << "enabled" << std::endl; + //std::cout << "fpeEnable: FE_INVALID is" << ( ( fpes & FE_INVALID ) ? " " : " NOT " ) << "enabled" << std::endl; + //std::cout << "fpeEnable: FE_OVERFLOW is" << ( ( fpes & FE_OVERFLOW ) ? " " : " NOT " ) << "enabled" << std::endl; + //std::cout << "fpeEnable: FE_UNDERFLOW is" << ( ( fpes & FE_UNDERFLOW ) ? " " : " NOT " ) << "enabled" << std::endl; + constexpr bool enableFPE = true; // this is hardcoded and no longer controlled by getenv( "CUDACPP_RUNTIME_ENABLEFPE" ) + if( enableFPE ) + { + std::cout << "INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW ); // new strategy #831 (do not enable FE_UNDERFLOW) + //fpes = fegetexcept(); + //std::cout << "fpeEnable: analyse fegetexcept()=" << fpes << std::endl; + //std::cout << "fpeEnable: FE_DIVBYZERO is" << ( ( fpes & FE_DIVBYZERO ) ? " " : " NOT " ) << "enabled" << std::endl; + //std::cout << "fpeEnable: FE_INEXACT is" << ( ( fpes & FE_INEXACT ) ? " " : " NOT " ) << "enabled" << std::endl; + //std::cout << "fpeEnable: FE_INVALID is" << ( ( fpes & FE_INVALID ) ? " " : " NOT " ) << "enabled" << std::endl; + //std::cout << "fpeEnable: FE_OVERFLOW is" << ( ( fpes & FE_OVERFLOW ) ? 
" " : " NOT " ) << "enabled" << std::endl; + //std::cout << "fpeEnable: FE_UNDERFLOW is" << ( ( fpes & FE_UNDERFLOW ) ? " " : " NOT " ) << "enabled" << std::endl; + } + else + { + //std::cout << "INFO: Do not enable SIGFPE traps for Floating Point Exceptions" << std::endl; + } +#else + //std::cout << "INFO: Keep default SIGFPE settings because feenableexcept is not available on MacOS" << std::endl; +#endif +} + +%(process_function_definitions)s +//========================================================================== diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc new file mode 100644 index 0000000000..bc9a146553 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc @@ -0,0 +1,84 @@ +! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. +!========================================================================== +! Copyright (C) 2020-2024 CERN and UCLouvain. +! Licensed under the GNU Lesser General Public License (version 3 or later). +! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +! Further modified by: D. Massaro, A. Thete, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
+!========================================================================== + //========================================================================== + // A class for calculating the matrix elements for + %(process_lines)s + //-------------------------------------------------------------------------- + + class CPPProcess + { + public: /* clang-format off */ + + // Constructor (from command line arguments) + CPPProcess( bool verbose = false, bool debug = false ); + + // Destructor + ~CPPProcess(); + + // Initialize process (read model parameters from file) + virtual void initProc( const std::string& param_card_name ); + + // Retrieve the compiler that was used to build this module + static const std::string getCompiler(); + + // Other methods of this instance (???) + //const std::vector& getMasses() const { return m_masses; } + //virtual int code() const{ return 1; } + //void setInitial( int inid1, int inid2 ){ id1 = inid1; id2 = inid2; } + //int getDim() const { return dim; } + //int getNIOParticles() const { return nexternal; } // nexternal was nioparticles + + // Accessors (unused so far: add four of them only to fix a clang build warning) + //bool verbose() const { return m_verbose; } + bool debug() const { return m_debug; } + + public: + + // Process-independent compile-time constants + static constexpr int np4 = 4; // dimensions of 4-momenta (E,px,py,pz) + static constexpr int nw6 = 4; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + + // Process-dependent compile-time constants + static constexpr int npari = %(nincoming)d; // #particles in the initial state (incoming): e.g. 2 (e+ e-) for e+ e- -> mu+ mu- + static constexpr int nparf = %(noutcoming)d; // #particles in the final state (outgoing): e.g. 2 (mu+ mu-) for e+ e- -> mu+ mu- + static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- + static constexpr int ncomb = %(nbhel)d; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + static constexpr int ndiagrams = %(ndiagrams)d; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = %(ncolor)s; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- + static constexpr int nmaxflavor = %(nmaxflavor)d; // the maximum number of flavor combinations + + // Hardcoded parameters for this process (constant class variables) + // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] + // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] + // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] + //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + + // Other variables of this instance (???) + //static const int ninitial = CPPProcess::npari; + //static const int nexternal = %(nexternal)d; // CPPProcess::npar (nexternal was nioparticles) + //static const int nwavefuncs = %(nwavefuncs)d; // (?!?! this should be nwf but export_cpp gives the wrong value here) + //static const int namplitudes = %(namp)d; + //static const int ncomb = %(ncomb)d; // CPPProcess::ncomb + + private: /* clang-format on */ + + // Command line arguments (constructor) + bool m_verbose; + bool m_debug; + + // Physics model parameters to be read from file (initProc function) +#ifndef MGONGPU_HARDCODE_PARAM + Parameters* m_pars; +#endif + std::vector m_masses; // external particle masses + + // Other variables of this instance (???) + //int id1, id2; // initial particle ids + //cxtype** amp; // ??? 
+ }; diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc new file mode 100644 index 0000000000..95f7269b2c --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -0,0 +1,877 @@ +! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. +!========================================================================== +! Copyright (C) 2020-2025 CERN and UCLouvain. +! Licensed under the GNU Lesser General Public License (version 3 or later). +! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. +!========================================================================== +//========================================================================== +// Class member functions for calculating the matrix elements for +%(process_lines)s + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined 
MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + + constexpr int np4 = CPPProcess::np4; // dimensions of 4-momenta (E, px, py, pz) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + constexpr int nmaxflavor = CPPProcess::nmaxflavor; // the maximum number of flavor combinations + + // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] + //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + + using Parameters_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) + using Parameters_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) + + // The number of SIMD vectors of events processed by calculate_jamps +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + constexpr int nParity = 2; +#else + constexpr int nParity = 1; +#endif + + // Physics parameters (masses, coupling, etc...) + // For CUDA performance, hardcoded constexpr's would be better: fewer registers and a tiny throughput increase + // However, physics parameters are user-defined through card files: use CUDA constant memory instead (issue #39) + // [NB if hardcoded parameters are used, it's better to define them here to avoid silent shadowing (issue #263)] + constexpr int nIPD = %(nipd)i; // SM independent parameters used in this CPPProcess.cc (FIXME? rename as sm_IndepParam?) + // Note: in the Python code generator, nIPD == nparam, while nIPC <= nicoup, because (see #823) + // nIPC may vary from one P*/CPPProcess.cc to another, while nicoup is defined in src/Param.h and is common to all P* + constexpr int nIPC = %(nipc)i; // SM independent couplings used in this CPPProcess.cc (FIXME? rename as sm_IndepCoupl?) 
+ // nIPF are the number of SM independent flavor couplings, of type FLV_COUPLING + constexpr int nMF = FLV_COUPLING::max_flavor; + constexpr int nIPF = %(nipf)i; + static_assert( nIPC <= nicoup ); + static_assert( nIPD >= 0 ); // Hack to avoid build warnings when nIPD==0 is unused + static_assert( nIPC >= 0 ); // Hack to avoid build warnings when nIPC==0 is unused + static_assert( nMF >= 0 ); // Hack to avoid build warnings when nMF ==0 is unused + static_assert( nIPF >= 0 ); // Hack to avoid build warnings when nIPF==0 is unused +#ifdef MGONGPU_HARDCODE_PARAM + %(cipdhrdcod)s + %(cipchrdcod)s + %(cipfhrdcod)s +#else +#ifdef MGONGPUCPP_GPUIMPL + %(cipddevice)s + %(cipcdevice)s + %(cipfdevice)s +#else + %(cipdstatic)s + %(cipcstatic)s + %(cipfstatic)s +#endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif +#endif + + // Helicity combinations (and filtering of "good" helicity combinations) +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ short cHel[ncomb][npar]; + __device__ __constant__ int cNGoodHel; + __device__ __constant__ int cGoodHel[ncomb]; + 
__device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; + __device__ __constant__ short cFlavors[nmaxflavor][npar]; +#else + static short cHel[ncomb][npar]; + static short cFlavors[nmaxflavor][npar]; +#endif + static int cNGoodHel; + static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } +#endif + + 
//-------------------------------------------------------------------------- +%(all_sigmaKin)s + //-------------------------------------------------------------------------- + + CPPProcess::CPPProcess( bool verbose, + bool debug ) + : m_verbose( verbose ) + , m_debug( debug ) +#ifndef MGONGPU_HARDCODE_PARAM + , m_pars( 0 ) +#endif + , m_masses() + { + // Helicities for the process [NB do keep 'static' for this constexpr array, see issue #283] + // *** NB There is no automatic check yet that these are in the same order as Fortran! #569 *** +%(all_helicities)s +%(all_flavors)s +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); + gpuMemcpyToSymbol( cFlavors, tFlavors, nmaxflavor * npar * sizeof( short ) ); +#else + memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); + memcpy( cFlavors, tFlavors, nmaxflavor * npar * sizeof( short ) ); +#endif + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif + } + + //-------------------------------------------------------------------------- + + CPPProcess::~CPPProcess() {} + + //-------------------------------------------------------------------------- + +#ifndef MGONGPU_HARDCODE_PARAM + // Initialize process (with parameters read from user cards) + void + CPPProcess::initProc( const std::string& param_card_name ) + { + // Instantiate the model class and set parameters that stay fixed during run + m_pars = Parameters::getInstance(); + SLHAReader slha( param_card_name, m_verbose ); + m_pars->setIndependentParameters( slha ); + m_pars->setIndependentCouplings(); + //m_pars->setDependentParameters(); // now computed event-by-event (running alphas #373) + //m_pars->setDependentCouplings(); // now computed event-by-event (running alphas #373) + if( m_verbose ) + { + m_pars->printIndependentParameters(); + m_pars->printIndependentCouplings(); + //m_pars->printDependentParameters(); // now computed event-by-event (running alphas #373) + 
//m_pars->printDependentCouplings(); // now computed event-by-event (running alphas #373) + } + %(initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif + // Read physics parameters like masses and couplings from user configuration files (static: initialize once) + // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory + %(cipdassign)s + %(cipcassign)s + %(cipfassign)s +#ifdef MGONGPUCPP_GPUIMPL + %(cipd2tipdSym)s + %(cipc2tipcSym)s + %(cipf2tipfSym)s +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + if( Parameters::nBsmIndepParam > 0 ) + gpuMemcpyToSymbol( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters::nBsmIndepParam * sizeof( double ) ); +#endif +#else + %(cipd2tipd)s + %(cipc2tipc)s + %(cipf2tipf)s +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + if( Parameters::nBsmIndepParam > 0 ) + memcpy( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters::nBsmIndepParam * sizeof( double ) ); +#endif +#endif%(cipddump)s%(cipcdump)s%(cipfdump)s + //for ( int i=0; imdl_bsmIndepParam[i] = " << m_pars->mdl_bsmIndepParam[i] << std::endl; + } +#else + // Initialize process (with hardcoded parameters) + void + CPPProcess::initProc( const std::string& /*param_card_name*/ ) + { + // Use hardcoded physics parameters + if( m_verbose ) + { + Parameters::printIndependentParameters(); + Parameters::printIndependentCouplings(); + //Parameters::printDependentParameters(); // now computed event-by-event (running alphas #373) + //Parameters::printDependentCouplings(); // now computed event-by-event (running alphas #373) + } + %(hardcoded_initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif + } +#endif + + //-------------------------------------------------------------------------- + + // Retrieve the compiler that was used to build this module + const std::string + 
CPPProcess::getCompiler() + { + std::stringstream out; + // HIP version (HIPCC) + // [Use __HIPCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [This tests if 'hipcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] + // [Check 'hipcc -dM -E -x hip -I ../../src CPPProcess.cc | grep HIP'] +#ifdef __HIPCC__ +#if defined HIP_VERSION_MAJOR && defined HIP_VERSION_MINOR && defined HIP_VERSION_PATCH + out << "hipcc " << HIP_VERSION_MAJOR << "." << HIP_VERSION_MINOR << "." << HIP_VERSION_PATCH; +#else + out << "hipcc UNKNOWN"; +#endif + out << " ("; +#endif + // CUDA version (NVCC) + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] + // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] +#ifdef __NVCC__ +#if defined __CUDACC_VER_MAJOR__ && defined __CUDACC_VER_MINOR__ && defined __CUDACC_VER_BUILD__ + out << "nvcc " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__ << "." << __CUDACC_VER_BUILD__; +#else + out << "nvcc UNKNOWN"; +#endif + out << " ("; +#endif + // ICX version (either as CXX or as host compiler inside NVCC) +#if defined __INTEL_COMPILER +#error "icc is no longer supported: please use icx" +#elif defined __INTEL_LLVM_COMPILER // alternative: __INTEL_CLANG_COMPILER + out << "icx " << __INTEL_LLVM_COMPILER; +#ifdef __NVCC__ + out << ", "; +#else + out << " ("; +#endif +#endif + // CLANG version (either as CXX or as host compiler inside NVCC or inside ICX) +#if defined __clang__ +#if defined __clang_major__ && defined __clang_minor__ && defined __clang_patchlevel__ +#ifdef __APPLE__ + out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; +#else + out << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! + // GCC toolchain version inside CLANG + std::string tchainout; + std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; + std::unique_ptr tchainpipe( popen( tchaincmd.c_str(), "r" ), pclose ); + if( !tchainpipe ) throw std::runtime_error( "`readelf ...` failed?" ); + std::array tchainbuf; + while( fgets( tchainbuf.data(), tchainbuf.size(), tchainpipe.get() ) != nullptr ) tchainout += tchainbuf.data(); + tchainout.pop_back(); // remove trailing newline +#if defined __NVCC__ or defined __INTEL_LLVM_COMPILER + out << ", gcc " << tchainout; +#else + out << " (gcc " << tchainout << ")"; +#endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ +#endif +#else + out << "clang UNKNOWKN"; +#endif +#else + // GCC version (either as CXX or as host compiler inside NVCC) +#if defined __GNUC__ && defined __GNUC_MINOR__ && defined __GNUC_PATCHLEVEL__ + out << "gcc " << __GNUC__ << "." << __GNUC_MINOR__ << "." 
<< __GNUC_PATCHLEVEL__; +#else + out << "gcc UNKNOWKN"; +#endif +#endif +#if defined __HIPCC__ or defined __NVCC__ or defined __INTEL_LLVM_COMPILER + out << ")"; +#endif + return out.str(); + } + + //-------------------------------------------------------------------------- + + __global__ void /* clang-format off */ + computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] + fptype* allcouplings // output: couplings[nevt*ndcoup*2] +#ifndef MGONGPUCPP_GPUIMPL + , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif + ) /* clang-format on */ + { +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; + using G_ACCESS = DeviceAccessGs; + using C_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); +#else + using namespace mg5amcCpu; + using G_ACCESS = HostAccessGs; + using C_ACCESS = HostAccessCouplings; + for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) + { + const int ievt0 = ipagV * neppV; + const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); + fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); + G2COUP( gs, couplings, bsmIndepParam ); + } +#endif + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const unsigned int* iflavorVec, // input: indices of the flavor combinations + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + 
bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt16, e.g. in future VPUs) + constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) + // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE 
enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) 
+ fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); + 
break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- + // Evaluate |M|^2, part independent of incoming flavour + + void /* clang-format off */ + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const unsigned int* iflavorVec, // input: indices of the flavor combinations + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling +#endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= 
ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads // input: cuda gputhreads +#else +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output +#endif + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif + ) /* clang-format on */ + { + mgDebugInitialise(); + + // SANITY CHECKS for cudacpp code generation (see issues #272 and #343 and PRs #619, #626, #360, #396 and #754) + // These variable are not used anywhere else in the code and their scope is limited to this sanity check + { + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) + constexpr int nprocesses = %(nproc)i; + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); + constexpr int process_id = %(proc_id)i; // code generation source: %(proc_id_source)s + static_assert( process_id == 1, "Assume process_id == 1" ); + } + + // Denominators: spins, colors and identical particles + constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343) + +#ifndef MGONGPUCPP_GPUIMPL + //assert( (size_t)(allmomenta) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] + 
//assert( (size_t)(allMEs) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events +#endif +#endif + + // Start sigmaKin_lines +%(sigmaKin_lines)s +} // end namespace diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc new file mode 100644 index 0000000000..d9a6584097 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -0,0 +1,137 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Thete, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
+//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +%(info_lines)s +//========================================================================== + +#ifndef MG5_%(process_file_name)s_H +#define MG5_%(process_file_name)s_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "GpuAbstraction.h" +#include "Parameters.h" + +#include + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +%(process_class_definitions)s + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] + fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] +#else + __global__ void + computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] + fptype* allcouplings, // output: couplings[nevt*ndcoup*2] + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const unsigned int* iflavorVec, // input: index of the flavor combination + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); 
// input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#else + void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const unsigned int* iflavorVec, // input: index of the flavor combination + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif /* clang-format on */ + + //-------------------------------------------------------------------------- + + int // output: nGoodHel (the number of good helicity combinations out of ncomb) + sigmaKin_setGoodHel( const bool* isGoodHel ); // input: isGoodHel[ncomb] - host array + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const unsigned int* iflavorVec, // input: index of the flavor combination + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling +#endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const unsigned int* iflavorVec, // input: index of the flavor combination + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling +#endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* 
allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output +#endif + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif /* clang-format on */ + + //-------------------------------------------------------------------------- +} + +#endif // MG5_%(process_file_name)s_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc new file mode 100644 index 0000000000..aac7506855 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -0,0 +1,50 @@ +! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. +!========================================================================== +! Copyright (C) 2020-2024 CERN and UCLouvain. +! Licensed under the GNU Lesser General Public License (version 3 or later). +! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+!========================================================================== + + // *** COLOR CHOICE BELOW *** + + // Store the leading color flows for choice of color +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifndef MGONGPUCPP_GPUIMPL + if( jamp2_sv ) // disable color choice if nullptr + { + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 + } +#else /* clang-format off */ + assert( iParity == 0 ); // sanity check for J2_ACCESS + using J2_ACCESS = DeviceAccessJamp2; + if( colAllJamp2s ) // disable color choice if nullptr + { + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); + } +#endif /* clang-format on */ +#endif + + // *** PREPARE OUTPUT JAMPS *** +#ifdef MGONGPUCPP_GPUIMPL + //printf( "calculate_jamps: dcNGoodHel=%%d\n", dcNGoodHel ); + // In CUDA, copy the local jamp to the output global-memory jamp + constexpr int ihel0 = 0; // the allJamps buffer already points to a specific helicity _within a super-buffer for dcNGoodHel helicities_ + using J_ACCESS = DeviceAccessJamp; + for( int icol = 0; icol < ncolor; icol++ ) + J_ACCESS::kernelAccessIcolIhelNhel( allJamps, icol, ihel0, dcNGoodHel ) = jamp_sv[icol]; +#else + // In C++, copy the local jamp to the output array passed as function argument + for( int icol = 0; icol < ncolor; icol++ ) + allJamp_sv[iParity * ncolor + icol] = jamp_sv[icol]; +#endif + } + // END LOOP ON IPARITY + + mgDebug( 1, __FUNCTION__ ); + return; + } diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc new file mode 100644 index 0000000000..290efed541 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc 
@@ -0,0 +1,333 @@ +! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. +!========================================================================== +! Copyright (C) 2020-2024 CERN and UCLouvain. +! Licensed under the GNU Lesser General Public License (version 3 or later). +! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +! Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +!========================================================================== +#include "GpuAbstraction.h" + + // === PART 0 - INITIALISATION (before calculate_jamps) === + // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event +#ifdef MGONGPUCPP_GPUIMPL + // *** PART 0a - CUDA *** + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); +#endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); +#else + // *** PART 0b - C++ *** + const int npagV = nevt / neppV; + for( int ipagV = 0; ipagV < npagV; ++ipagV ) + { + const int ievt0 = ipagV * neppV; + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv = fptype_sv{ 0 }; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); + 
fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } + denominators_sv = fptype_sv{ 0 }; +#endif + } +#endif + + // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === + // (in both CUDA and C++, using precomputed good helicities) + +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ + + // *** START OF PART 1a - CUDA (one event per GPU thread) *** + + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); +#else + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, nevt ); +#endif + } + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has 
completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, iflavorVec, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, iflavorVec, helcolDenominators[0] ); +#endif + // *** END OF PART 1a - CUDA (one event per GPU thread) *** + +#else // CUDA OR C++ + + // *** START OF PART 1b - C++ (loop on event pages) +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed fptypes #537: float for color algebra and double elsewhere + // Delay color algebra and ME updates (only on even pages) + assert( npagV %% 2 == 0 ); // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page + const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time +#else + const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time +#endif +#ifdef _OPENMP + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ + // - default(none): no variables are shared by default + // 
- shared: as the name says + // - private: give each thread its own copy, without initialising + // - firstprivate: give each thread its own copy, and initialise with value from outside +#define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig +#else +#define _OMPLIST1 +#endif +#pragma omp parallel for default( none ) shared( _OMPLIST0 _OMPLIST1 ) +#undef _OMPLIST0 +#undef _OMPLIST1 +#endif // _OPENMP + for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time +#else + const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time +#endif + // Running sum of partial amplitudes squared for event by event color selection (#402) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) +#endif + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, iflavorVec, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); +#else + calculate_jamps( ihel, allmomenta, allcouplings, iflavorVec, jamp_sv, ievt00 ); +#endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); + MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); +#endif + } + // Event-by-event random choice of helicity #403 + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { +#if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); + const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); +#else + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel] ); + const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); +#endif + if( okhel ) + { + const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + break; + } + } +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt2, allrndhel[ievt2] ); + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); + if( allrndhel[ievt2] < ( 
MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) + { + const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt2] = ihelF; + //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt2, ihelF ); + break; + } + } +#endif + } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; + } + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelIdVec[ieppV] = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; +#ifdef MGONGPU_CPPSIMD + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV %% neppV]; +#else + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )]; +#endif + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // 
NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); + break; + } + } + } + } + else + { + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int ievt2 = ievt00 + ieppV + neppV; + allselcol[ievt2] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) +#endif + } + } +#endif // multichannel enabled (random color choice) + } + // *** END OF PART 1b - C++ (loop on event pages) + +#endif // CUDA or C++ + + // PART 2 - FINALISATION (after calculate_jamps) + // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event + // [NB 'sum over final spins, average over initial spins', eg see + // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] +#ifndef MGONGPUCPP_GPUIMPL + for( int ipagV = 0; ipagV < npagV; ++ipagV ) + { + const int ievt0 = ipagV * neppV; + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv /= (helcolDenominators[0] * broken_symmetry_factor(iflavorVec[ievt0])); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + { + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + MEs_sv *= numerators_sv[channelId - 1] / 
denominators_sv; + } +#endif + //for( int ieppV = 0; ieppV < neppV; ieppV++ ) + //{ + // const unsigned int ievt = ipagV * neppV + ieppV; + // printf( "sigmaKin: ievt=%%2d me=%%f\n", ievt, allMEs[ievt] ); + //} + } +#endif + mgDebugFinalise(); + } + + //-------------------------------------------------------------------------- diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/profile.sh b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/profile.sh new file mode 100644 index 0000000000..7644ea9e83 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/profile.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +# Copyright (C) 2020-2024 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. + +usage(){ + echo "Usage (GUI analysis): $0 -l label [-cc] [-p #blocks #threads #iterations]" + echo "Usage (CL analysis): $0 -nogui [-p #blocks #threads #iterations]" + exit 1 +} + +# Default options +tag=cu +###cuargs="16384 32 12" # NEW DEFAULT 2020.08.10 (faster on local, and allows comparison to global and shared memory) +###ccargs=" 256 32 12" # Similar to cuda config, but faster than using "16384 32 12" +##cuargs="16384 32 2" # faster tests +##ccargs=" 256 32 2" # faster tests +cuargs="2048 256 1" # NEW DEFAULT 2021.04.06 (matches "-p 2048 256 12" but only one iteration) +ccargs="2048 256 1" # NEW DEFAULT 2021.04.06 (matches "-p 2048 256 12" but only one iteration) +args= +label= + +# Command line arguments +while [ "$1" != "" ]; do + # Profile C++ instead of cuda + if [ "$1" == "-cc" ]; then + if [ "$tag" != "nogui" ]; then + tag=cc + shift + else + echo "ERROR! Incompatible options -gui and -cc" + usage + fi + # Fast no-GUI profiling with ncu + elif [ "$1" == "-nogui" ]; then + if [ "$tag" != "cc" ]; then + tag=nogui + shift + else + echo "ERROR! 
Incompatible options -gui and -cc" + usage + fi + # Override blocks/threads/iterations + # (NB do not exceed 12 iterations: profiling overhead per iteration is huge) + elif [ "$1" == "-p" ]; then + if [ "$4" != "" ]; then + args="$2 $3 $4" + shift 4 + else + usage + fi + # Label + elif [ "$1" == "-l" ]; then + if [ "$2" != "" ]; then + label="$2" + shift 2 + else + usage + fi + # Invalid arguments + else + usage + fi +done + +if [ "$tag" == "cc" ]; then + if [ "$args" == "" ]; then args=$ccargs; fi + cmd="./check.exe -p $args" + make +else + if [ "$args" == "" ]; then args=$cuargs; fi + cmd="./gcheck.exe -p $args" + make +fi + +ncu="ncu" +nsys="nsys" +ncugui="ncu-ui &" +nsysgui="nsight-sys &" + +# Settings specific to CERN condor/batch nodes +###host=$(hostname) +###if [ "${host%%cern.ch}" != "${host}" ] && [ "${host##b}" != "${host}" ]; then +### ncu=/usr/local/cuda-11.0/bin/ncu +### ###nsys=/usr/local/cuda-10.1/bin/nsys +### ###nsys=/usr/local/cuda-10.2/bin/nsys +### nsys=/cvmfs/sft.cern.ch/lcg/releases/cuda/11.0RC-d9c38/x86_64-centos7-gcc62-opt/bin/nsys +### ncugui="Launch the Nsight Compute GUI from Windows" +### nsysgui="Launch the Nsight System GUI from Windows" +###fi + +# Settings specific to CERN IT/SC nodes +# (nsys 11.4 and 11.5 fail with 'boost::wrapexcept') +host=$(hostname) +if [ "${host%%cern.ch}" != "${host}" ] && [ "${host##itsc}" != "${host}" ]; then + CUDA_NSIGHT_HOME=/usr/local/cuda-11.1 + echo "Using Nsight from ${CUDA_NSIGHT_HOME}" + ncu=${CUDA_NSIGHT_HOME}/bin/ncu + nsys=${CUDA_NSIGHT_HOME}/bin/nsys + ncugui="${CUDA_NSIGHT_HOME}/bin/ncu-ui &" + nsysgui="${CUDA_NSIGHT_HOME}/bin/nsight-sys &" +fi + +# Set the ncu sampling period (default is auto) +# The value is in the range [0..31], the actual period is 2**(5+value) cycles. 
+###ncu="${ncu} --sampling-interval 0" # MAX sampling frequency +###ncu="${ncu} --sampling-interval 31" # MIN sampling frequency + +# METRICS FOR COALESCED MEMORY ACCESS (AOSOA etc) +# See https://developer.nvidia.com/blog/using-nsight-compute-to-inspect-your-kernels/ +# These used to be called gld_transactions and global_load_requests +# See also https://docs.nvidia.com/nsight-compute/2019.5/NsightComputeCli/index.html#nvprof-metric-comparison +# See also https://stackoverflow.com/questions/60535867 +metrics=l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum + +# METRICS FOR REGISTER PRESSURE +metrics+=,launch__registers_per_thread + +# METRICS FOR DIVERGENCE +metrics+=,sm__sass_average_branch_targets_threads_uniform.pct + +# GUI analysis +if [ "$tag" != "nogui" ]; then + + if [ "$label" == "" ]; then + echo "ERROR! You must specify a label" + usage + fi + + arg1=$(echo $args | cut -d' ' -f1) + arg2=$(echo $args | cut -d' ' -f2) + arg3=$(echo $args | cut -d' ' -f3) + + ###if [ "${host%%raplab*}" != "${host}" ]; then + ### logs=nsight_logs_raplab + ###elif [ "${host%%cern.ch}" != "${host}" ] && [ "${host##b}" != "${host}" ]; then + ### logs=nsight_logs_lxbatch + ###else + ### logs=nsight_logs + ###fi + logs=nsight_logs + + if [ ! 
-d $logs ]; then mkdir -p $logs; fi + trace=$logs/Sigma_sm_gg_ttxgg_${tag}_`date +%m%d_%H%M`_b${arg1}_t${arg2}_i${arg3} + if [ "$label" != "" ]; then trace=${trace}_${label}; fi + + echo + echo "PROFILING: ${cmd}" + echo "OUTPUT: ${trace}.*" + echo + + \rm -f ${trace}.* + + hostname > ${trace}.txt + echo "nproc=$(nproc)" >> ${trace}.txt + echo >> ${trace}.txt + ( time ${cmd} ) 2>&1 | tee -a ${trace}.txt + nvidia-smi -q -d CLOCK >> ${trace}.txt + + if [ "$tag" == "cu" ]; then + echo + echo "${ncu} --set full --metrics ${metrics} -o ${trace} ${cmd}" + echo + ${ncu} --set full --metrics ${metrics} -o ${trace} ${cmd} + fi + echo + echo "${nsys} profile -o ${trace} ${cmd}" + echo + ${nsys} profile -o ${trace} ${cmd} + echo "" + echo "TO ANALYSE TRACE FILES:" + echo " ${ncugui}" + echo " ${nsysgui}" + +# NO-GUI analysis +else + + echo + echo "PROFILING: ${cmd}" + echo "${ncu} --metrics ${metrics} ${cmd}" + echo + echo sudo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} $(which ${ncu}) --metrics ${metrics} --target-processes all ${cmd} + sudo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} $(which ${ncu}) --metrics ${metrics} --target-processes all ${cmd} + +fi diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h new file mode 100644 index 0000000000..b430928a92 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -0,0 +1,191 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
+//========================================================================== + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuFptypes.h" + +#include "CPPProcess.h" + +#include +#include +#include + +// Simplified rambo version for 2 to N (with N>=2) processes with massless particles +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int np4 = CPPProcess::np4; // dimensions of 4-momenta (E,px,py,pz) + constexpr int npari = CPPProcess::npari; // #particles in the initial state (incoming): e.g. 2 (e+ e-) for e+ e- -> mu+ mu- + constexpr int nparf = CPPProcess::nparf; // #particles in the final state (outgoing): e.g. 2 (mu+ mu-) for e+ e- -> mu+ mu- + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + + //-------------------------------------------------------------------------- + + // Fill in the momenta of the initial particles + // [NB: the output buffer includes both initial and final momenta, but only initial momenta are filled in] + template + __host__ __device__ void + ramboGetMomentaInitial( const fptype energy, // input: energy + fptype* momenta ) // output: momenta for one event or for a set of events + { + const fptype energy1 = energy / 2; + const fptype energy2 = energy / 2; + const fptype mom = energy / 2; + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, 0 ) = energy1; + M_ACCESS::kernelAccessIp4Ipar( momenta, 1, 0 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 2, 0 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 3, 0 ) = mom; + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, 1 ) = energy2; + M_ACCESS::kernelAccessIp4Ipar( momenta, 1, 1 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 2, 1 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 3, 1 ) = -mom; + } + + //-------------------------------------------------------------------------- + + // Fill in the momenta of the final particles using the RAMBO algorithm + // [NB: the output buffer 
includes both initial and final momenta, but only initial momenta are filled in] + template + __host__ __device__ void + ramboGetMomentaFinal( const fptype energy, // input: energy + const fptype* rndmom, // input: random numbers in [0,1] for one event or for a set of events + fptype* momenta, // output: momenta for one event or for a set of events + fptype* wgts ) // output: weights for one event or for a set of events + { + /**************************************************************************** + * rambo * + * ra(ndom) m(omenta) b(eautifully) o(rganized) * + * * + * a democratic multi-particle phase space generator * + * authors: s.d. ellis, r. kleiss, w.j. stirling * + * this is version 1.0 - written by r. kleiss * + * -- adjusted by hans kuijf, weights are logarithmic (1990-08-20) * + * -- adjusted by madgraph@sheffield_gpu_hackathon team (2020-07-29) * + * * + ****************************************************************************/ + + // output weight + fptype& wt = W_ACCESS::kernelAccess( wgts ); + + // AV special case nparf==1 (issue #358) + if constexpr( nparf == 1 ) + { + static bool first = true; + if( first ) + { +#ifdef MGONGPUCPP_GPUIMPL + if constexpr( M_ACCESS::isOnDevice() ) // avoid + { + const int ievt0 = 0; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + if( ievt == ievt0 ) + printf( "WARNING! Rambo called with 1 final particle: random numbers will be ignored\n" ); + } + else +#endif + { + printf( "WARNING! 
Rambo called with 1 final particle: random numbers will be ignored\n" ); + } + first = false; + } + const int iparf = 0; + for( int i4 = 0; i4 < np4; i4++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = 0; + for( int ipari = 0; ipari < npari; ipari++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) += M_ACCESS::kernelAccessIp4Ipar( momenta, i4, ipari ); + } + } + wt = 1; + return; + } + + // initialization step: factorials for the phase space weight + const fptype twopi = 8. * atan( 1. ); + const fptype po2log = log( twopi / 4. ); + fptype z[nparf]; + if constexpr( nparf > 1 ) // avoid build warning on clang (related to #358) + z[1] = po2log; + for( int kpar = 2; kpar < nparf; kpar++ ) z[kpar] = z[kpar - 1] + po2log - 2. * log( fptype( kpar - 1 ) ); + for( int kpar = 2; kpar < nparf; kpar++ ) z[kpar] = ( z[kpar] - log( fptype( kpar ) ) ); + + // generate n massless momenta in infinite phase space + fptype q[nparf][np4]; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + const fptype r1 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 0, iparf ); + const fptype r2 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 1, iparf ); + const fptype r3 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 2, iparf ); + const fptype r4 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 3, iparf ); + const fptype c = 2. * r1 - 1.; + const fptype s = sqrt( 1. 
- c * c ); + const fptype f = twopi * r2; + q[iparf][0] = -log( r3 * r4 ); + q[iparf][3] = q[iparf][0] * c; + q[iparf][2] = q[iparf][0] * s * cos( f ); + q[iparf][1] = q[iparf][0] * s * sin( f ); + } + + // calculate the parameters of the conformal transformation + fptype r[np4]; + fptype b[np4 - 1]; + for( int i4 = 0; i4 < np4; i4++ ) r[i4] = 0.; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + for( int i4 = 0; i4 < np4; i4++ ) r[i4] = r[i4] + q[iparf][i4]; + } + const fptype rmas = sqrt( pow( r[0], 2 ) - pow( r[3], 2 ) - pow( r[2], 2 ) - pow( r[1], 2 ) ); + for( int i4 = 1; i4 < np4; i4++ ) b[i4 - 1] = -r[i4] / rmas; + const fptype g = r[0] / rmas; + const fptype a = 1. / ( 1. + g ); + const fptype x0 = energy / rmas; + + // transform the q's conformally into the p's (i.e. the 'momenta') + for( int iparf = 0; iparf < nparf; iparf++ ) + { + fptype bq = b[0] * q[iparf][1] + b[1] * q[iparf][2] + b[2] * q[iparf][3]; + for( int i4 = 1; i4 < np4; i4++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = x0 * ( q[iparf][i4] + b[i4 - 1] * ( q[iparf][0] + a * bq ) ); + } + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ) = x0 * ( g * q[iparf][0] + bq ); + } + + // calculate weight (NB return log of weight) + wt = po2log; + if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; + +#ifndef MGONGPUCPP_GPUIMPL + // issue warnings if weight is too small or too large + static int iwarn[5] = { 0, 0, 0, 0, 0 }; + if( wt < -180. ) + { + if( iwarn[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl; + iwarn[0] = iwarn[0] + 1; + } + if( wt > 174. 
) + { + if( iwarn[1] <= 5 ) std::cout << "Too large wt, risk for overflow: " << wt << std::endl; + iwarn[1] = iwarn[1] + 1; + } +#endif + + // return for weighted massless momenta + // nothing else to do in this event if all particles are massless (nm==0) + + return; + } + + //-------------------------------------------------------------------------- +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc new file mode 100644 index 0000000000..678eb8c34e --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -0,0 +1,449 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +//---------------------------------------------------------------------------- +// Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests +//---------------------------------------------------------------------------- + +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "MadgraphTest.h" +#include "MatrixElementKernels.h" +#include "MemoryAccessChannelIds.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" +#include "RamboSamplingKernels.h" +#include "RandomNumberKernels.h" +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#include "coloramps.h" +#endif +#include "epoch_process_id.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +struct CUDA_CPU_TestBase : public TestDriverBase +{ + static constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static constexpr int np4 = CPPProcess::np4; + static constexpr int npar = CPPProcess::npar; + 
static_assert( gputhreads % neppM == 0, "ERROR! #threads/block should be a multiple of neppM" ); + static_assert( gputhreads <= mgOnGpu::ntpbMAX, "ERROR! #threads/block should be <= ntpbMAX" ); + CUDA_CPU_TestBase( const std::string& refFileName ) + : TestDriverBase( npar, refFileName ) {} + // Does this test use channelIds? + virtual bool useChannelIds() const = 0; + // Set channelId array (in the same way for CUDA and CPU tests) + static constexpr unsigned int warpSize = 32; // FIXME: add a sanity check in madevent that this is the minimum? (would need to expose this from cudacpp to madevent) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + static void setChannelIds( BufferChannelIds& hstChannelIds, std::size_t iiter ) + { + static const char* debugC = getenv( "CUDACPP_RUNTEST_DEBUG" ); + static const bool debug = ( debugC != 0 ) && ( std::string( debugC ) != "" ); + // Fill channelIds for multi-channel tests #896 + // (NB: these are only used if useChannelIds == true) + // TEMPORARY(0): debug multichannel tests with channelId=1 for all events + //for( unsigned int i = 0; i < nevt; ++i ) hstChannelIds[i] = 1; + // TEMPORARY(1): debug multichannel tests with channelId=1,2,..,ndiag,1,2,..ndiag,... (every event gets a different channel, no warps) + //for( unsigned int i = 0; i < nevt; ++i ) hstChannelIds[i] = 1 + i % CPPProcess::ndiagrams; + // ALMOST FINAL test implementation: 1111222233331111... (every 32-event warp gets a different channel) + // FINAL(?) test implementation: 2222333344442222... (every 32-event warp gets a different channel, skip those without associated iconfig #917) + static_assert( nevt % warpSize == 0, "ERROR! 
nevt should be a multiple of warpSize" ); + constexpr unsigned int nWarp = nevt / warpSize; + for( unsigned int iWarp = 0; iWarp < nWarp; ++iWarp ) + { + //const unsigned int channelId = 1 + ( iWarp + iiter * nWarp ) % CPPProcess::ndiagrams; // bug #917 + const int iconfig = 1 + ( iWarp + iiter * nWarp ) % mgOnGpu::nconfigSDE; + unsigned int channelId = 0; + //for( unsigned int idiagram = 1; idiagram < CPPProcess::ndiagrams; idiagram++ ) // two bugs #920 and #919 + for( unsigned int idiagram = 0; idiagram < mgOnGpu::nchannels; idiagram++ ) // fix #920 and work around #919 + { + if( mgOnGpu::hostChannel2iconfig[idiagram] == iconfig ) + { + channelId = idiagram + 1; // fix #917 (NB add +1 because channelId uses F indexing) + break; + } + } + assert( channelId > 0 ); // sanity check that the channelId for the given iconfig was found + if( debug ) std::cout << "CUDA_CPU_TestBase::setChannelIds: iWarp=" << iWarp << ", iconfig=" << iconfig << ", channelId=" << channelId << std::endl; + for( unsigned int i = 0; i < warpSize; ++i ) + hstChannelIds[iWarp * warpSize + i] = channelId; + } + } +#else + static void setChannelIds( BufferChannelIds& hstChannelIds, std::size_t /*iiter*/ ) + { + // No-multichannel tests (set a DUMMY channelId=0 for all events: this is not used for ME comparison, but it does enter the comparison to reference results #976) + for( unsigned int i = 0; i < nevt; ++i ) hstChannelIds[i] = 0; + } +#endif +}; + +#ifndef MGONGPUCPP_GPUIMPL +struct CPUTest : public CUDA_CPU_TestBase +{ + // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) + // [NB the hst/dev memory arrays must be initialised in the constructor, see issue #290] + CPPProcess process; + HostBufferRndNumMomenta hstRndMom; + HostBufferChannelIds hstChannelIds; + HostBufferMomenta hstMomenta; + HostBufferGs hstGs; + HostBufferRndNumHelicity hstRndHel; + HostBufferRndNumColor hstRndCol; + HostBufferWeights hstWeights; + 
HostBufferMatrixElements hstMatrixElements; + HostBufferSelectedHelicity hstSelHel; + HostBufferSelectedColor hstSelCol; + HostBufferHelicityMask hstIsGoodHel; + std::unique_ptr pmek; + + // Create a process object + // Read param_card and set parameters + // ** WARNING EVIL EVIL ** + // The CPPProcess constructor has side effects on the globals Proc::cHel, which is needed in ME calculations. + // Don't remove! + CPUTest( const std::string& refFileName ) + : CUDA_CPU_TestBase( refFileName ) + , process( /*verbose=*/false ) + , hstRndMom( nevt ) + , hstChannelIds( nevt ) + , hstMomenta( nevt ) + , hstGs( nevt ) + , hstRndHel( nevt ) + , hstRndCol( nevt ) + , hstWeights( nevt ) + , hstMatrixElements( nevt ) + , hstSelHel( nevt ) + , hstSelCol( nevt ) + , hstIsGoodHel( CPPProcess::ncomb ) + , pmek( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstChannelIds, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ) + { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? + process.initProc( "../../Cards/param_card.dat" ); + } + + virtual ~CPUTest() {} + + void prepareRandomNumbers( unsigned int iiter ) override + { + // Random numbers for momenta + CommonRandomNumberKernel rnk( hstRndMom ); + rnk.seedGenerator( 1337 + iiter ); + rnk.generateRnarray(); + // Random numbers for helicity and color selection (fix #931) + CommonRandomNumberKernel rnk2( hstRndHel ); + rnk2.seedGenerator( 1338 + iiter ); + rnk2.generateRnarray(); + CommonRandomNumberKernel rnk3( hstRndCol ); + rnk3.seedGenerator( 1339 + iiter ); + rnk3.generateRnarray(); + } + + void prepareMomenta( fptype energy ) override + { + RamboSamplingKernelHost rsk( energy, hstRndMom, hstMomenta, hstWeights, nevt ); + // --- 2a. Fill in momenta of initial state particles on the device + rsk.getMomentaInitial(); + // --- 2b. 
Fill in momenta of final state particles using the RAMBO algorithm on the device + // (i.e. map random numbers to final-state particle momenta for each of nevt events) + rsk.getMomentaFinal(); + } + + void runSigmaKin( std::size_t iiter ) override + { + constexpr fptype fixedG = 1.2177157847767195; // fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + for( unsigned int i = 0; i < nevt; ++i ) hstGs[i] = fixedG; + setChannelIds( hstChannelIds, iiter ); // fill channelIds for multi-channel tests #896 + if( iiter == 0 ) pmek->computeGoodHelicities(); + pmek->computeMatrixElements( useChannelIds() ); + } + + fptype getMomentum( std::size_t ievt, unsigned int ipar, unsigned int ip4 ) const override + { + assert( ipar < npar ); + assert( ip4 < np4 ); + return MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, ip4, ipar ); + } + + fptype getMatrixElement( std::size_t ievt ) const override + { + return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); + } + + int getChannelId( std::size_t ievt ) const override + { + return MemoryAccessChannelIds::ieventAccessConst( hstChannelIds.data(), ievt ); + } + + int getSelectedHelicity( std::size_t ievt ) const override + { + //return MemoryAccessSelectedHelicity::ieventAccessConst( hstSelHel.data(), ievt ); // does not exist yet... + return hstSelHel.data()[ievt]; + } + + int getSelectedColor( std::size_t ievt ) const override + { + //return MemoryAccessSelectedColor::ieventAccessConst( hstSelCol.data(), ievt ); // does not exist yet... + return hstSelCol.data()[ievt]; + } +}; + +// Old test with multi-channel disabled #466 +struct CPUTestNoMultiChannel : public CPUTest +{ + // Does this test use channelIds? 
+ bool useChannelIds() const override final { return false; } + + // Constructor + CPUTestNoMultiChannel( const std::string& refFileName ) + : CPUTest( refFileName ) {} // suffix .txt + + // Destructor + virtual ~CPUTestNoMultiChannel() {} +}; + +// New test with multi-channel enabled #896 +struct CPUTestMultiChannel : public CPUTest +{ + // Does this test use channelIds? + bool useChannelIds() const override final { return true; } + + // Constructor + CPUTestMultiChannel( const std::string& refFileName ) + : CPUTest( refFileName + "2" ) {} // suffix .txt2 + + // Destructor + virtual ~CPUTestMultiChannel() {} +}; +#endif + +#ifdef MGONGPUCPP_GPUIMPL +struct CUDATest : public CUDA_CPU_TestBase +{ + // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) + // [NB the hst/dev memory arrays must be initialised in the constructor, see issue #290] + CPPProcess process; + PinnedHostBufferRndNumMomenta hstRndMom; + PinnedHostBufferMomenta hstMomenta; + PinnedHostBufferGs hstGs; + PinnedHostBufferRndNumHelicity hstRndHel; + PinnedHostBufferRndNumColor hstRndCol; + PinnedHostBufferWeights hstWeights; + PinnedHostBufferChannelIds hstChannelIds; + PinnedHostBufferMatrixElements hstMatrixElements; + PinnedHostBufferSelectedHelicity hstSelHel; + PinnedHostBufferSelectedColor hstSelCol; + PinnedHostBufferHelicityMask hstIsGoodHel; + DeviceBufferRndNumMomenta devRndMom; + DeviceBufferChannelIds devChannelIds; + DeviceBufferMomenta devMomenta; + DeviceBufferGs devGs; + DeviceBufferRndNumHelicity devRndHel; + DeviceBufferRndNumColor devRndCol; + DeviceBufferWeights devWeights; + DeviceBufferMatrixElements devMatrixElements; + DeviceBufferSelectedHelicity devSelHel; + DeviceBufferSelectedColor devSelCol; + DeviceBufferHelicityMask devIsGoodHel; + std::unique_ptr pmek; + + // Create a process object + // Read param_card and set parameters + // ** WARNING EVIL EVIL ** + // The CPPProcess constructor has side 
effects on the globals Proc::cHel, which is needed in ME calculations. + // Don't remove! + CUDATest( const std::string& refFileName ) + : CUDA_CPU_TestBase( refFileName ) + , process( /*verbose=*/false ) + , hstRndMom( nevt ) + , hstChannelIds( nevt ) + , hstMomenta( nevt ) + , hstGs( nevt ) + , hstRndHel( nevt ) + , hstRndCol( nevt ) + , hstWeights( nevt ) + , hstMatrixElements( nevt ) + , hstSelHel( nevt ) + , hstSelCol( nevt ) + , hstIsGoodHel( CPPProcess::ncomb ) + , devRndMom( nevt ) + , devChannelIds( nevt ) + , devMomenta( nevt ) + , devGs( nevt ) + , devRndHel( nevt ) + , devRndCol( nevt ) + , devWeights( nevt ) + , devMatrixElements( nevt ) + , devSelHel( nevt ) + , devSelCol( nevt ) + , devIsGoodHel( CPPProcess::ncomb ) + , pmek( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devChannelIds, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ) + { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? + process.initProc( "../../Cards/param_card.dat" ); + } + + virtual ~CUDATest() {} + + void prepareRandomNumbers( unsigned int iiter ) override + { + // Random numbers for momenta + CommonRandomNumberKernel rnk( hstRndMom ); + rnk.seedGenerator( 1337 + iiter ); + rnk.generateRnarray(); + copyDeviceFromHost( devRndMom, hstRndMom ); + // Random numbers for helicity and color selection (fix #931) + CommonRandomNumberKernel rnk2( hstRndHel ); + rnk2.seedGenerator( 1338 + iiter ); + rnk2.generateRnarray(); + copyDeviceFromHost( devRndHel, hstRndHel ); + CommonRandomNumberKernel rnk3( hstRndCol ); + rnk3.seedGenerator( 1339 + iiter ); + rnk3.generateRnarray(); + copyDeviceFromHost( devRndCol, hstRndCol ); + } + + void prepareMomenta( fptype energy ) override + { + RamboSamplingKernelDevice rsk( energy, devRndMom, devMomenta, devWeights, gpublocks, gputhreads ); + // --- 2a. 
Fill in momenta of initial state particles on the device
    rsk.getMomentaInitial();
    // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device
    //         (i.e. map random numbers to final-state particle momenta for each of nevt events)
    rsk.getMomentaFinal();
    // --- 2c. CopyDToH Weights
    copyHostFromDevice( hstWeights, devWeights );
    // --- 2d. CopyDToH Momenta
    copyHostFromDevice( hstMomenta, devMomenta );
  }

  // Compute matrix elements on the device for iteration iiter and copy the
  // results (MEs and selected helicity/color) back to the host buffers.
  void runSigmaKin( std::size_t iiter ) override
  {
    constexpr fptype fixedG = 1.2177157847767195; // fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
    for( unsigned int i = 0; i < nevt; ++i ) hstGs[i] = fixedG;
    copyDeviceFromHost( devGs, hstGs ); // BUG FIX #566
    setChannelIds( hstChannelIds, iiter ); // fill channelIds for multi-channel tests #896
    copyDeviceFromHost( devChannelIds, hstChannelIds );
    if( iiter == 0 ) pmek->computeGoodHelicities(); // helicity filtering only on the first iteration
    pmek->computeMatrixElements( useChannelIds() );
    copyHostFromDevice( hstMatrixElements, devMatrixElements );
    copyHostFromDevice( hstSelHel, devSelHel );
    copyHostFromDevice( hstSelCol, devSelCol );
  }

  // Accessors below read the host-side copies filled by prepareMomenta/runSigmaKin.

  fptype getMomentum( std::size_t ievt, unsigned int ipar, unsigned int ip4 ) const override
  {
    assert( ipar < npar );
    assert( ip4 < np4 );
    return MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, ip4, ipar );
  }

  fptype getMatrixElement( std::size_t ievt ) const override
  {
    return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt );
  }

  int getChannelId( std::size_t ievt ) const override
  {
    return MemoryAccessChannelIds::ieventAccessConst( hstChannelIds.data(), ievt );
  }

  int getSelectedHelicity( std::size_t ievt ) const override
  {
    //return MemoryAccessSelectedHelicity::ieventAccessConst( hstSelHel.data(), ievt ); // does not exist yet...
+ return hstSelHel.data()[ievt]; + } + + int getSelectedColor( std::size_t ievt ) const override + { + //return MemoryAccessSelectedColor::ieventAccessConst( hstSelCol.data(), ievt ); // does not exist yet... + return hstSelCol.data()[ievt]; + } +}; + +// Old test with multi-channel disabled #466 +struct CUDATestNoMultiChannel : public CUDATest +{ + // Does this test use channelIds? + bool useChannelIds() const override final { return false; } + + // Constructor + CUDATestNoMultiChannel( const std::string& refFileName ) + : CUDATest( refFileName ) {} // suffix .txt + + // Destructor + virtual ~CUDATestNoMultiChannel() {} +}; + +// New test with multi-channel enabled #896 +struct CUDATestMultiChannel : public CUDATest +{ + // Does this test use channelIds? + bool useChannelIds() const override final { return true; } + + // Constructor + CUDATestMultiChannel( const std::string& refFileName ) + : CUDATest( refFileName + "2" ) {} // suffix .txt2 + + // Destructor + virtual ~CUDATestMultiChannel() {} +}; +#endif /* clang-format off */ + +// AV July 2024 much simpler class structure without the presently-unnecessary googletest templates +// This is meant as a workaround to prevent not-understood segfault #907 when adding a second test +// Note: instantiate test2 first and test1 second to ensure that the channelid printout from the dtors comes from test1 first and test2 second +#ifdef MGONGPUCPP_GPUIMPL +// CUDA test drivers +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +CUDATestMultiChannel driver2( MG_EPOCH_REFERENCE_FILE_NAME ); +#define TESTID2( s ) s##_GPU_MULTICHANNEL +#endif +CUDATestNoMultiChannel driver1( MG_EPOCH_REFERENCE_FILE_NAME ); +#define TESTID1( s ) s##_GPU_NOMULTICHANNEL +#else +// CPU test drivers +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +CPUTestMultiChannel driver2( MG_EPOCH_REFERENCE_FILE_NAME ); +#define TESTID2( s ) s##_CPU_MULTICHANNEL +#endif +CPUTestNoMultiChannel driver1( MG_EPOCH_REFERENCE_FILE_NAME ); +#define TESTID1( s ) s##_CPU_NOMULTICHANNEL +#endif 
+// Madgraph tests +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +MadgraphTest mgTest2( driver2 ); +#endif +MadgraphTest mgTest1( driver1 ); +// Instantiate Google test 1 +#define XTESTID1( s ) TESTID1( s ) +TEST( XTESTID1( MG_EPOCH_PROCESS_ID ), compareMomAndME ) +{ +#ifdef MGONGPU_CHANNELID_DEBUG + driver1.pmek->setTagForNevtProcessedByChannel( "(no multichannel)" ); +#endif + mgTest1.CompareMomentaAndME( *this ); +} +// Instantiate Google test 2 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define XTESTID2( s ) TESTID2( s ) +TEST( XTESTID2( MG_EPOCH_PROCESS_ID ), compareMomAndME ) +{ +#ifdef MGONGPU_CHANNELID_DEBUG + driver2.pmek->setTagForNevtProcessedByChannel( "(channelid array)" ); +#endif + mgTest2.CompareMomentaAndME( *this ); +} +#endif +/* clang-format on */ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f new file mode 100644 index 0000000000..63d69bfde6 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/smatrix_multi.f @@ -0,0 +1,99 @@ +c Copyright (C) 2020-2024 CERN and UCLouvain. +c Licensed under the GNU Lesser General Public License (version 3 or later). +c Created by: O. Mattelaer (Oct 2023) for the MG5aMC CUDACPP plugin. +c Further modified by: O. Mattelaer, A. Valassi (2023-2024) for the MG5aMC CUDACPP plugin. +c ====================================================== +c *START* Included from CUDACPP template smatrix_multi.f +c (into function smatrix$i_multi in auto_dsig$i.f) +c ====================================================== + CALL COUNTERS_SMATRIX1MULTI_STOP( -1 ) ! fortranMEs=-1 +#ifdef MG5AMC_MEEXPORTER_CUDACPP + ENDIF + + IF( FBRIDGE_MODE .EQ. 1 .OR. FBRIDGE_MODE .LT. 0 ) THEN ! (CppOnly=1 or BothQuiet=-1 or BothDebug=-2) + IF( LIMHEL.NE.0 ) THEN + WRITE(6,*) 'ERROR! The cudacpp bridge only supports LIMHEL=0' + STOP + ENDIF + IF ( FIRST ) THEN ! 
exclude first pass (helicity filtering) from timers (#461) + CALL COUNTERS_SMATRIX1MULTI_START( 1, VECSIZE_USED ) ! cudacppHEL=1 + CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering + & P_MULTI, ALL_G, IFLAV_VEC, HEL_RAND, COL_RAND, + & OUT2, SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities + FIRST = .FALSE. +c ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486) +c IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all) +c CALL RESET_CUMULATIVE_VARIABLE() ! mimic 'avoid bias of the initialization' within SMATRIX1 +c ENDIF + CALL FBRIDGEGETNGOODHEL(FBRIDGE_PBRIDGE,NGOODHEL,NTOTHEL) + IF( NTOTHEL .NE. NCOMB ) THEN + WRITE(6,*) 'ERROR! Cudacpp/Fortran mismatch', + & ' in total number of helicities', NTOTHEL, NCOMB + STOP + ENDIF + WRITE (6,*) 'NGOODHEL =', NGOODHEL + WRITE (6,*) 'NCOMB =', NCOMB + CALL COUNTERS_SMATRIX1MULTI_STOP( 1 ) ! cudacppHEL=1 + ENDIF + CALL COUNTERS_SMATRIX1MULTI_START( 0, VECSIZE_USED ) ! cudacppMEs=0 + IF ( .NOT. MULTI_CHANNEL ) THEN + CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled + & P_MULTI, ALL_G, IFLAV_VEC, HEL_RAND, COL_RAND, + & OUT2, SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities + ELSE + IF( SDE_STRAT.NE.1 ) THEN + WRITE(6,*) 'ERROR ! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy + STOP + ENDIF + CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled + & IFLAV_VEC, HEL_RAND, COL_RAND, CHANNELS, OUT2, + & SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities + ENDIF + CALL COUNTERS_SMATRIX1MULTI_STOP( 0 ) ! cudacppMEs=0 + ENDIF + + IF( FBRIDGE_MODE .LT. 0 ) THEN ! 
(BothQuiet=-1 or BothDebug=-2) + DO IVEC=1, VECSIZE_USED + CBYF1 = OUT2(IVEC)/OUT(IVEC) - 1 + FBRIDGE_NCBYF1 = FBRIDGE_NCBYF1 + 1 + FBRIDGE_CBYF1SUM = FBRIDGE_CBYF1SUM + CBYF1 + FBRIDGE_CBYF1SUM2 = FBRIDGE_CBYF1SUM2 + CBYF1 * CBYF1 + IF( CBYF1 .GT. FBRIDGE_CBYF1MAX ) FBRIDGE_CBYF1MAX = CBYF1 + IF( CBYF1 .LT. FBRIDGE_CBYF1MIN ) FBRIDGE_CBYF1MIN = CBYF1 + IF( FBRIDGE_MODE .EQ. -2 ) THEN ! (BothDebug=-2) + WRITE (*,'(I4,2E16.8,F23.11,I3,I3,I4,I4)') + & IVEC, OUT(IVEC), OUT2(IVEC), 1+CBYF1, + & SELECTED_HEL(IVEC), SELECTED_HEL2(IVEC), + & SELECTED_COL(IVEC), SELECTED_COL2(IVEC) + ENDIF + IF( ABS(CBYF1).GT.5E-5 .AND. NWARNINGS.LT.20 ) THEN + NWARNINGS = NWARNINGS + 1 + WRITE (*,'(A,I4,A,I4,2E16.8,F23.11)') + & 'WARNING! (', NWARNINGS, '/20) Deviation more than 5E-5', + & IVEC, OUT(IVEC), OUT2(IVEC), 1+CBYF1 + ENDIF + END DO + ENDIF + + IF( FBRIDGE_MODE .EQ. 1 .OR. FBRIDGE_MODE .LT. 0 ) THEN ! (CppOnly=1 or BothQuiet=-1 or BothDebug=-2) + DO IVEC=1, VECSIZE_USED + OUT(IVEC) = OUT2(IVEC) ! use the cudacpp ME instead of the fortran ME! + SELECTED_HEL(IVEC) = SELECTED_HEL2(IVEC) ! use the cudacpp helicity instead of the fortran helicity! + SELECTED_COL(IVEC) = SELECTED_COL2(IVEC) ! use the cudacpp color instead of the fortran color! + END DO + ENDIF +#endif + + IF ( FIRST_CHID ) THEN + IF ( MULTI_CHANNEL ) THEN + WRITE (*,*) 'MULTI_CHANNEL = TRUE' + ELSE + WRITE (*,*) 'MULTI_CHANNEL = FALSE' + ENDIF + WRITE (*,*) 'CHANNEL_ID =', CHANNELS(1) + FIRST_CHID = .FALSE. 
+ ENDIF +c ====================================================== +c **END** Included from CUDACPP template smatrix_multi.f +c ====================================================== + diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc new file mode 100644 index 0000000000..ee16e9a952 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -0,0 +1,511 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +//---------------------------------------------------------------------------- +// Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests +//---------------------------------------------------------------------------- + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "constexpr_math.h" +#include "epoch_process_id.h" +#include "valgrind.h" + +#include + +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include +#include +#include + +#ifdef MGONGPUCPP_GPUIMPL +#define TESTID( s ) s##_GPU_MISC +#else +#define TESTID( s ) s##_CPU_MISC +#endif + +#define XTESTID( s ) TESTID( s ) + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +#ifdef MGONGPU_CPPSIMD /* clang-format off */ +#define EXPECT_TRUE_sv( cond ) { bool_v mask( cond ); EXPECT_TRUE( maskand( mask ) ); } +#else +#define EXPECT_TRUE_sv( cond ) { EXPECT_TRUE( cond ); } +#endif /* clang-format on */ + + inline const std::string + boolTF( const bool& b ) + { + return ( b ? 
"T" : "F" ); + } + +#ifdef MGONGPU_CPPSIMD + inline const std::string + boolTF( const bool_v& v ) + { + std::stringstream out; + out << "{ " << ( v[0] ? "T" : "F" ); + for( int i = 1; i < neppV; i++ ) out << ", " << ( v[i] ? "T" : "F" ); + out << " }"; + return out.str(); + } +#endif +} + +TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + //-------------------------------------------------------------------------- + + EXPECT_TRUE( true ); + + //-------------------------------------------------------------------------- + + // Vector initialization for fptype_sv + { + fptype_sv f{ 0 }; + EXPECT_TRUE_sv( f == 0 ); + } + { + fptype_sv f = fptype_sv{ 0 }; + EXPECT_TRUE_sv( f == 0 ); + } + + // Vector initialization for fptype_sv - demonstrate bug #339 in older cxmake implementation + { + fptype_sv f{ 1 }; + //std::cout << f << std::endl << boolTF( f == 1 ) << std::endl; + //EXPECT_TRUE_sv( f == 1 ); // this fails for vectors! 
TFFF
#ifndef MGONGPU_CPPSIMD
  EXPECT_TRUE_sv( f == 1 ); // this succeeds: T
#else
  EXPECT_TRUE( ( f == 1 )[0] ); // this succeeds: TFFF[0]
  EXPECT_TRUE( ( f[0] == 1 ) );
  for( int i = 1; i < neppV; i++ )
  {
    EXPECT_TRUE( !( ( f == 1 )[i] ) ); // this succeeds: FTTT[i>=1]
    EXPECT_TRUE( ( f[i] == 0 ) ); // equals 0, not 1
  }
#endif
  }

#ifdef MGONGPU_CPPSIMD
  // Vector initialization for cxtype_sv - demonstrate fix for bug #339
  {
    fptype_sv f1 = fptype_v{ 0 } + 1;
    EXPECT_TRUE_sv( f1 == 1 );
    cxtype_v c12 = cxmake( f1, 2 );
    //std::cout << c12 << std::endl << boolTF( c12.real() == 1 ) << std::endl << boolTF( c12.imag() == 2 ) << std::endl;
    EXPECT_TRUE_sv( c12.real() == 1 );
    EXPECT_TRUE_sv( c12.imag() == 2 );
    cxtype_v c21 = cxmake( 2, f1 );
    //std::cout << c21 << std::endl << boolTF( c21.real() == 2 ) << std::endl << boolTF( c21.imag() == 1 ) << std::endl;
    EXPECT_TRUE_sv( c21.real() == 2 );
    EXPECT_TRUE_sv( c21.imag() == 1 );
  }
#endif

  // Vector initialization for cxtype_sv
  {
    cxtype_sv c = cxzero_sv();
    EXPECT_TRUE_sv( c.real() == 0 );
    EXPECT_TRUE_sv( c.imag() == 0 );
  }
  {
    cxtype_sv c = cxmake( 1, fptype_sv{ 0 } ); // here was a bug #339
    EXPECT_TRUE_sv( c.real() == 1 );
    EXPECT_TRUE_sv( c.imag() == 0 );
  }
  {
    cxtype_sv c = cxmake( fptype_sv{ 0 }, 1 ); // here was a bug #339
    EXPECT_TRUE_sv( c.real() == 0 );
    EXPECT_TRUE_sv( c.imag() == 1 );
  }

  // Array initialization for cxtype_sv array (example: jamp_sv in CPPProcess.cc)
  {
    cxtype_sv array[2] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+ //std::cout << array[0].real() << std::endl; std::cout << boolTF( array[0].real() == 0 ) << std::endl; + EXPECT_TRUE_sv( array[0].real() == 0 ); + EXPECT_TRUE_sv( array[0].imag() == 0 ); + EXPECT_TRUE_sv( array[1].real() == 0 ); + EXPECT_TRUE_sv( array[1].imag() == 0 ); + } + + // Alternative array initialization for cxtype_sv array (example: was used for outwf in testxxx.cc) + { + cxtype_sv array[2]{}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxype is NOT, if "{}" is missing!) + //std::cout << array[0].real() << std::endl; std::cout << boolTF( array[0].real() == 0 ) << std::endl; + EXPECT_TRUE_sv( array[0].real() == 0 ); + EXPECT_TRUE_sv( array[0].imag() == 0 ); + EXPECT_TRUE_sv( array[1].real() == 0 ); + EXPECT_TRUE_sv( array[1].imag() == 0 ); + } + + //-------------------------------------------------------------------------- + + // Scalar complex references + { + using namespace mgOnGpu; + // Refs to f1, f2 + fptype f1 = 1; + fptype f2 = 2; + cxtype_ref r12( f1, f2 ); // copy refs + //cxtype_ref r12a( r12 ); //deleted + cxtype_ref r12a( cxtype_ref( f1, f2 ) ); // copy refs + //cxtype_ref r12b = r12; // deleted + cxtype_ref r12b = cxtype_ref( f1, f2 ); // copy refs + EXPECT_TRUE( cxtype( r12 ).real() == 1 ); + EXPECT_TRUE( cxtype( r12 ).imag() == 2 ); + EXPECT_TRUE( cxtype( r12a ).real() == 1 ); + EXPECT_TRUE( cxtype( r12a ).imag() == 2 ); + EXPECT_TRUE( cxtype( r12b ).real() == 1 ); + EXPECT_TRUE( cxtype( r12b ).imag() == 2 ); + // Refs to f1c, f2c + fptype f1c = 0; + fptype f2c = 0; + cxtype_ref r12c( f1c, f2c ); + EXPECT_TRUE( cxtype( r12c ).real() == 0 ); + EXPECT_TRUE( cxtype( r12c ).imag() == 0 ); + //r12c = r12; // deleted + r12c = cxtype( r12 ); // copy values + EXPECT_TRUE( cxtype( r12c ).real() == 1 ); + EXPECT_TRUE( cxtype( r12c ).imag() == 2 ); + // Update f1, f2 + f1 = 10; + f2 = 20; + EXPECT_TRUE( cxtype( r12 ).real() == 10 ); + EXPECT_TRUE( cxtype( r12 ).imag() == 20 ); + EXPECT_TRUE( cxtype( r12a ).real() == 10 ); + 
EXPECT_TRUE( cxtype( r12a ).imag() == 20 ); + EXPECT_TRUE( cxtype( r12b ).real() == 10 ); + EXPECT_TRUE( cxtype( r12b ).imag() == 20 ); + EXPECT_TRUE( cxtype( r12c ).real() == 1 ); // points to f1c, not to f1 + EXPECT_TRUE( cxtype( r12c ).imag() == 2 ); // points to f2c, not to f2 + } + + // Vector complex references + { + using namespace mgOnGpu; + // Refs to f1, f2 + fptype_sv f1 = fptype_sv{ 0 } + 1; + fptype_sv f2 = fptype_sv{ 0 } + 2; + cxtype_sv_ref r12( f1, f2 ); // copy refs + //cxtype_sv_ref r12a( r12 ); //deleted + cxtype_sv_ref r12a( cxtype_sv_ref( f1, f2 ) ); // copy refs + //cxtype_sv_ref r12b = r12; // deleted + cxtype_sv_ref r12b = cxtype_sv_ref( f1, f2 ); // copy refs + EXPECT_TRUE_sv( cxtype_sv( r12 ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12 ).imag() == 2 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).imag() == 2 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).imag() == 2 ); + // Refs to f1c, f2c + fptype_sv f1c = fptype_sv{ 0 }; + fptype_sv f2c = fptype_sv{ 0 }; + cxtype_sv_ref r12c( f1c, f2c ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).real() == 0 ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).imag() == 0 ); + //r12c = r12; // deleted + r12c = cxtype_sv( r12 ); // copy values + EXPECT_TRUE_sv( cxtype_sv( r12c ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).imag() == 2 ); + // Update f1, f2 + f1 = fptype_sv{ 0 } + 10; + f2 = fptype_sv{ 0 } + 20; + EXPECT_TRUE_sv( cxtype_sv( r12 ).real() == 10 ); + EXPECT_TRUE_sv( cxtype_sv( r12 ).imag() == 20 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).real() == 10 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).imag() == 20 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).real() == 10 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).imag() == 20 ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).real() == 1 ); // points to f1c, not to f1 + EXPECT_TRUE_sv( cxtype_sv( r12c ).imag() == 2 ); // points to f2c, not to f2 + } + + 
//-------------------------------------------------------------------------- + + // Boolean vector (mask) times FP vector + /* + // From https://github.com/madgraph5/madgraph4gpu/issues/765#issuecomment-1853672838 + channelids_sv = CHANNEL_ACCESS::kernelAccess( pchannelIds ); // the 4 channels in the SIMD vector + bool_sv mask_sv = ( channelids_sv == 1 ); + numerators_sv += mask_sv * cxabs2( amp_sv[0] ); + if( pchannelIds != nullptr ) denominators_sv += cxabs2( amp_sv[0] ); + */ + { + typedef bool_sv test_int_sv; // defined as scalar_or_vector of long int (FPTYPE=double) or int (FPTYPE=float) + test_int_sv channelids0_sv{}; // mimic CHANNEL_ACCESS::kernelAccess( pchannelIds ) + test_int_sv channelids1_sv{}; // mimic CHANNEL_ACCESS::kernelAccess( pchannelIds ) + fptype_sv absamp0_sv{}; // mimic cxabs2( amp_sv[0] ) + fptype_sv absamp1_sv{}; // mimic cxabs2( amp_sv[0] ) +#ifdef MGONGPU_CPPSIMD + for( int i = 0; i < neppV; i++ ) + { + channelids0_sv[i] = i; // 0123 + channelids1_sv[i] = i; // 1234 + absamp0_sv[i] = 10. + i; // 10. 11. 12. 13. + absamp1_sv[i] = 11. + i; // 11. 12. 13. 14. 
+ } +#else + channelids0_sv = 0; + channelids1_sv = 1; + absamp0_sv = 10.; + absamp1_sv = 11.; +#endif + bool_sv mask0_sv = ( channelids0_sv % 2 == 0 ); // even channels 0123 -> TFTF (1010) + bool_sv mask1_sv = ( channelids1_sv % 2 == 0 ); // even channels 1234 -> FTFT (0101) + constexpr fptype_sv fpZERO_sv{}; // 0000 + //fptype_sv numerators0_sv = mask0_sv * absamp0_sv; // invalid operands to binary * ('__vector(4) long int' and '__vector(4) double') + fptype_sv numerators0_sv = fpternary( mask0_sv, absamp0_sv, fpZERO_sv ); // equivalent to "mask0_sv * absamp0_sv" + fptype_sv numerators1_sv = fpternary( mask1_sv, absamp1_sv, fpZERO_sv ); // equivalent to "mask1_sv * absamp1_sv" +#ifdef MGONGPU_CPPSIMD + //std::cout << "numerators0_sv: " << numerators0_sv << std::endl; + //std::cout << "numerators1_sv: " << numerators1_sv << std::endl; + for( int i = 0; i < neppV; i++ ) + { + // Values of numerators0_sv: 10.*1 11.*0 12.*1 13.*0 + if( channelids0_sv[i] % 2 == 0 ) // even channels + EXPECT_TRUE( numerators0_sv[i] == ( 10. + i ) ); + else // odd channels + EXPECT_TRUE( numerators0_sv[i] == 0. ); + // Values of numerators1_sv: 11.*0 12.*1 13.*0 14.*1 + if( channelids1_sv[i] % 2 == 0 ) // even channels + EXPECT_TRUE( numerators1_sv[i] == ( 11. + i ) ); + else // odd channels + EXPECT_TRUE( numerators1_sv[i] == 0. ); + } +#else + // Values of numerators0_sv: 10.*1 + EXPECT_TRUE( numerators0_sv == 10. ); + // Values of numerators1_sv: 11.*0 + EXPECT_TRUE( numerators1_sv == 0. 
); +#endif + } + + //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Test constexpr pow + EXPECT_TRUE( constexpr_pow( 10, 0 ) == 1 ); + EXPECT_TRUE( constexpr_pow( 10, 1 ) == 10 ); + EXPECT_TRUE( constexpr_pow( 10, 2 ) == 100 ); + EXPECT_NEAR( constexpr_pow( 10, -1 ), 0.1, 0.1 * 1E-14 ) + << std::setprecision( 40 ) << "constexpr_pow( 10, -1 ) = " << constexpr_pow( 10, -1 ); + EXPECT_NEAR( constexpr_pow( 10, -2 ), 0.01, 0.01 * 1E-14 ) + << std::setprecision( 40 ) << "constexpr_pow( 10, -2 ) = " << constexpr_pow( 10, -2 ); + EXPECT_NEAR( constexpr_pow( 100, 0.5 ), 10, 10 * 1E-14 ) + << std::setprecision( 40 ) << "constexpr_pow( 100, 0.5 ) = " << constexpr_pow( 100, 0.5 ); + EXPECT_NEAR( constexpr_pow( 100, -0.5 ), 0.1, 0.1 * 1E-14 ) + << std::setprecision( 40 ) << "constexpr_pow( 100, -0.5 ) = " << constexpr_pow( 100, -0.5 ); + EXPECT_NEAR( constexpr_pow( 10000, 0.25 ), 10, 10 * 1E-14 ) + << std::setprecision( 40 ) << "constexpr_pow( 10000, 0.25 ) = " << constexpr_pow( 10000, 0.25 ); + EXPECT_NEAR( constexpr_pow( 10000, -0.25 ), 0.1, 0.1 * 1E-14 ) + << std::setprecision( 40 ) << "constexpr_pow( 10000, -0.25 ) = " << constexpr_pow( 10000, -0.25 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance0, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + const double tolerance = tolerance0 * ( !RUNNING_ON_VALGRIND ? 1 : 1100 ); // higher tolerance when running through valgrind #906 + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + if( !RUNNING_ON_VALGRIND ) + { + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + } + else + { + // Higher tolerance when running through valgrind #906 + const long double ctanx = constexpr_tan( x ); + const long double taninf = 4E14; // declare tan(x) as "infinity if above this threshold + if( ctanx > -taninf && ctanx < taninf ) + EXPECT_NEAR( std::tan( x ), ctanx, std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + else + { + // Allow tan(x)=-inf if 
ctan(x)=+inf and viceversa + EXPECT_GT( std::abs( std::tan( x ) ), taninf ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + /* + // Require tan(x)=+inf if ctan(x)=+inf and similarly for -inf (this fails around 3*pi/2) + if( ctanx > 0 ) + EXPECT_GT( std::tan( x ), taninf ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + else + EXPECT_LT( std::tan( x ), -taninf ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + */ + } + } + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance0 = toleranceForX( x ); + const double tolerance = tolerance0 * ( !RUNNING_ON_VALGRIND ? 1 : 1100 ); // higher tolerance when running through valgrind #906 + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + if( !RUNNING_ON_VALGRIND ) + { + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + else + { + // Higher tolerance when running through valgrind #906 + const long double ctanx = constexpr_tan( x ); + const long double taninf = 4E14; // declare tan(x) as "infinity if above this threshold + if( ctanx > -taninf && ctanx < taninf ) + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << 
std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + else + { + // Allow tan(x)=-inf if ctan(x)=+inf and viceversa + EXPECT_GT( std::abs( std::tan( x ) ), taninf ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + /* + // Require tan(x)=+inf if ctan(x)=+inf and similarly for -inf (this fails around 3*pi/2) + if( ctanx > 0 ) + EXPECT_GT( std::tan( x ), taninf ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + else + EXPECT_LT( std::tan( x ), -taninf ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + */ + } + } + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc new file mode 100644 index 0000000000..3112dd3dee --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -0,0 +1,455 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. 
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +//---------------------------------------------------------------------------- +// Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests +//---------------------------------------------------------------------------- + +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "HelAmps_%(model_name)s.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessWavefunctions.h" +#include "MemoryBuffers.h" +#include "epoch_process_id.h" + +#include + +#include +#include +#include // for signal and SIGFPE (see https://stackoverflow.com/a/17473528) +#include +#include +#include +#include +#ifdef MGONGPUCPP_GPUIMPL +#define TESTID( s ) s##_GPU_XXX +#else +#define TESTID( s ) s##_CPU_XXX +#endif + +#define XTESTID( s ) TESTID( s ) + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + std::string fpeHandlerMessage = "unknown"; + int fpeHandlerIevt = -1; + inline void fpeHandlerTestxxx( int /*sig*/ ) + { +#ifdef MGONGPUCPP_GPUIMPL + std::cerr << "Floating Point Exception (GPU): '" << fpeHandlerMessage << "' ievt=" << fpeHandlerIevt << std::endl; +#else + std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << fpeHandlerMessage << "' ievt=" << fpeHandlerIevt << std::endl; +#endif + exit( 1 ); + } +} + +TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) +{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif +#ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) + auto fpeHandlerDefault = signal( SIGFPE, fpeHandlerTestxxx ); +#endif + constexpr bool dumpEvents = false; // dump the expected output of the test? + constexpr bool testEvents = !dumpEvents; // run the test? 
+ constexpr fptype toleranceXXXs = std::is_same::value ? 1.E-15 : 1.E-5; + // Constant parameters + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + constexpr int np4 = CPPProcess::np4; + const int nevt = 32; // 12 independent tests plus 20 duplicates (need a multiple of 16 for floats '512z') + assert( nevt %% neppM == 0 ); // nevt must be a multiple of neppM + assert( nevt %% neppV == 0 ); // nevt must be a multiple of neppV + // Fill in the input momenta +#ifdef MGONGPUCPP_GPUIMPL + mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] +#else + mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] +#endif /* clang-format off */ + // NB NEW TESTS FOR DEBUGGING #701: KEEP TWO SEPARATE SETS (16-SIMD-VECTORS!) OF TESTS FOR M==0 AND M!=0! + const fptype par0[np4 * nevt] = // AOS[nevt][np4] + { + 500, 0, 0, 500, // #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400, // #3 (m=0 pT>0 pz>0) + 500, 180, 240, -400, // #4 (m=0 pT>0 pz<0) + 500, 0, 0, 500, // #5 DUPLICATE == #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // #6 DUPLICATE == #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // #7 DUPLICATE == #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400, // #8 DUPLICATE == #3 (m=0 pT>0 pz>0) + 500, 180, 240, -400, // #9 DUPLICATE == #4 (m=0 pT>0 pz<0) + 500, 0, 0, 500, // #10 DUPLICATE == #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // #11 DUPLICATE == #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // #12 DUPLICATE == #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400, // #13 DUPLICATE == #3 (m=0 pT>0 pz>0) + 500, 180, 240, -400, // #14 DUPLICATE == #4 (m=0 pT>0 pz<0) + 500, 0, 0, 500, // #15 DUPLICATE == #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, 0, // #16 (m=50>0 pT=0 pz=0) + 500, 0, 0, 300, // #17 (m=40>0 pT=0 pz>0) + 500, 0, 0, -300, // #18 (m=40>0 pT=0 pz<0) + 500, 180, 240, 0, // #19 (m=40>0 pT>0 pz=0) + 500, -240, -180, 0, // #20 (m=40>0 pT>0 pz=0) + 500, 
180, 192, 144, // #21 (m=40>0 pT>0 pz>0) + 500, 180, 192, -144, // #22 (m=40>0 pT>0 pz<0) + 500, 0, 0, 0, // #23 DUPLICATE == #16 (m=50>0 pT=0 pz=0) + 500, 0, 0, 300, // #24 DUPLICATE == #17 (m=40>0 pT=0 pz>0) + 500, 0, 0, -300, // #25 DUPLICATE == #18 (m=40>0 pT=0 pz<0) + 500, 180, 240, 0, // #26 DUPLICATE == #19 (m=40>0 pT>0 pz=0) + 500, -240, -180, 0, // #27 DUPLICATE == #20 (m=40>0 pT>0 pz=0) + 500, 180, 192, 144, // #28 DUPLICATE == #21 (m=40>0 pT>0 pz>0) + 500, 180, 192, -144, // #29 DUPLICATE == #22 (m=40>0 pT>0 pz<0) + 500, 0, 0, 0, // #30 DUPLICATE == #16 (m=50>0 pT=0 pz=0) + 500, 0, 0, 300 // #31 DUPLICATE == #17 (m=40>0 pT=0 pz>0) + }; /* clang-format on */ + // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) + // See https://en.cppreference.com/w/c/language/array_initialization#Notes + fptype mass0[nevt] = {}; + bool ispzgt0[nevt] = {}; + bool ispzlt0[nevt] = {}; + bool isptgt0[nevt] = {}; + for( int ievt = 0; ievt < nevt; ievt++ ) + { + const fptype p0 = par0[ievt * np4 + 0]; + const fptype p1 = par0[ievt * np4 + 1]; + const fptype p2 = par0[ievt * np4 + 2]; + const fptype p3 = par0[ievt * np4 + 3]; + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; + ispzgt0[ievt] = ( p3 > 0 ); + ispzlt0[ievt] = ( p3 < 0 ); + isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); + } + const int ipar0 = 0; // use only particle0 for this test + for( int ievt = 0; ievt < nevt; ievt++ ) + { + for( int ip4 = 0; ip4 < np4; ip4++ ) + { + MemoryAccessMomenta::ieventAccessIp4Ipar( hstMomenta.data(), ievt, ip4, ipar0 ) = par0[ievt * np4 + ip4]; // AOS to AOSOA + } + } + // Expected output wavefunctions + std::vector> expwfs; +#include "testxxx_cc_ref.txt" // expwfs.push_back( {...} ); + std::string dumpFileName = "testxxx_cc_ref.txt.new"; + // Compute the output wavefunctions + // Dump new reference file if requested + constexpr int nw6 = 
CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + int itest = 0; // index on the expected output vector + std::ofstream dumpFile; + if( dumpEvents ) + { + dumpFile.open( dumpFileName, std::ios::trunc ); + dumpFile << " // Copyright (C) 2020-2024 CERN and UCLouvain." << std::endl + << " // Licensed under the GNU Lesser General Public License (version 3 or later)." << std::endl + << " // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin." << std::endl + << " // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin." << std::endl; + } + // Lambda function for dumping wavefunctions + auto dumpwf6 = [&]( std::ostream& out, const cxtype_sv wf[6], const char* xxx, int ievt, int nsp, fptype mass ) + { + out << std::setprecision( 15 ) << std::scientific; + out << " expwfs.push_back( {"; + out << " // ---------" << std::endl; + for( int iw6 = 0; iw6 < nw6; iw6++ ) + { +#ifdef MGONGPU_CPPSIMD + const int ieppV = ievt %% neppV; // #event in the current event vector in this iteration +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + out << std::setw( 26 ) << cxreal( wf[iw6][ieppV] ) << ", "; + out << std::setw( 22 ) << cximag( wf[iw6][ieppV] ); +#else + out << std::setw( 26 ) << wf[iw6].real()[ieppV] << ", "; + out << std::setw( 22 ) << wf[iw6].imag()[ieppV]; +#endif +#else + out << std::setw( 26 ) << wf[iw6].real(); + out << ", " << std::setw( 22 ) << wf[iw6].imag(); +#endif + if( iw6 < nw6 - 1 ) + out << ", "; + else + out << " } );"; + out << " // itest=" << itest << ": " << xxx << "#" << ievt; + out << " nsp=" << nsp << " mass=" << (int)mass << std::endl; + } + out << std::defaultfloat; + }; + // Lambda function for testing wavefunctions (1) + auto testwf6 = [&]( const cxtype_sv wf[6], const char* xxx, int ievt, int nsp, fptype mass ) + { + if( dumpEvents ) dumpwf6( dumpFile, wf, xxx, ievt, nsp, mass ); + if( testEvents ) + { + std::array& expwf = expwfs[itest]; + //std::cout << 
"Testing " << std::setw(3) << itest << ": " << xxx << " #" << ievt << std::endl; + ////for ( int iw6 = 0; iw60) can be used without triggering FPEs (#701) + // This is done by filling the full SIMD vector with the value of ievt, which was already tested to respect the relevant assumptions + for( int jevt = 0; jevt < nevt; jevt++ ) + for( int ip4 = 0; ip4 < np4; ip4++ ) + MemoryAccessMomenta::ieventAccessIp4Ipar( hstMomenta.data(), jevt, ip4, ipar0 ) = par0[ievt * np4 + ip4]; // AOS to AOSOA + } + }; + // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) + // See https://en.cppreference.com/w/c/language/array_initialization#Notes + cxtype_sv outwfI[6] = {}; // last result of ixxxxx (mass==0) + cxtype_sv outwfO[6] = {}; // last result of oxxxxx (mass==0) + cxtype_sv outwf[6] = {}; + cxtype_sv outwf3[6] = {}; // NB: only 3 are filled by sxxxxx, but 6 are compared! + fptype* fp_outwfI = reinterpret_cast( outwfI ); // proof of concept for using fptype* in the interface + fptype* fp_outwfO = reinterpret_cast( outwfO ); // proof of concept for using fptype* in the interface + fptype* fp_outwf = reinterpret_cast( outwf ); // proof of concept for using fptype* in the interface + fptype* fp_outwf3 = reinterpret_cast( outwf3 ); // proof of concept for using fptype* in the interface + const int nhel = 1; + // *** START OF TESTING LOOP + for( auto nsp: { -1, +1 } ) // antifermion/fermion (or initial/final for scalar and vector) + { + for( int ievt = 0; ievt < nevt; ievt++ ) + { +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + if( debug ) + { + std::cout << std::endl; + std::cout << "nsp=" << nsp << " ievt=" << ievt << ": "; + for( int ip4 = 0; ip4 < np4; ip4++ ) std::cout << par0[ievt * np4 + ip4] << ", "; + std::cout << std::endl; + } + const int ipagV = ievt / neppV; // #event vector in this iteration + const fptype* ievt0Momenta = MemoryAccessMomenta::ieventAccessRecordConst( hstMomenta.data(), 
ipagV * neppV ); + // Test ixxxxx - NO ASSUMPTIONS + { + prepareTest( "ixxxxx", ievt ); + const fptype fmass = mass0[ievt]; + ixxxxx( ievt0Momenta, fmass, nhel, nsp, fp_outwfI, ipar0 ); + testwf6( outwfI, "ixxxxx", ievt, nsp, fmass ); + ixxxxx( ievt0Momenta, -fmass, nhel, nsp, fp_outwfI, ipar0 ); + testwf6( outwfI, "ixxxxx", ievt, nsp, -fmass ); + } + // Test ipzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzgt0[ievt] ) + { + prepareTest( "ipzxxx", ievt ); + ipzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfI, "ipzxxx", ievt ); + testwf6( outwf, "ipzxxx", ievt, nsp, 0 ); + } + // Test imzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzlt0[ievt] ) + { + prepareTest( "imzxxx", ievt ); + imzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfI, "imzxxx", ievt ); + testwf6( outwf, "imzxxx", ievt, nsp, 0 ); + } + // Test ixzxxx - ASSUMPTIONS: (FMASS == 0) and (PT > 0) + if( mass0[ievt] == 0 && isptgt0[ievt] ) + { + prepareTest( "ixzxxx", ievt ); + ixzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfI, "ixzxxx", ievt ); + testwf6( outwf, "ixzxxx", ievt, nsp, 0 ); + } + // Test vxxxxx - NO ASSUMPTIONS + { + prepareTest( "vxxxxx", ievt ); + const fptype vmass = mass0[ievt]; + vxxxxx( ievt0Momenta, vmass, nhel, nsp, fp_outwf, ipar0 ); + testwf6( outwf, "vxxxxx", ievt, nsp, vmass ); + vxxxxx( ievt0Momenta, -vmass, nhel, nsp, fp_outwf, ipar0 ); + testwf6( outwf, "vxxxxx", ievt, nsp, -vmass ); + } + // Test sxxxxx - NO ASSUMPTIONS + { + prepareTest( "sxxxxx", ievt ); + const fptype smass = mass0[ievt]; + sxxxxx( ievt0Momenta, nsp, fp_outwf3, ipar0 ); // no mass, no helicity (was "smass>0") + testwf6( outwf3, "sxxxxx", ievt, nsp, smass ); + sxxxxx( ievt0Momenta, nsp, fp_outwf3, ipar0 ); // no mass, no helicity (was "smass<0") + testwf6( outwf3, "sxxxxx", ievt, nsp, 
-smass ); + } + // Test oxxxxx - NO ASSUMPTIONS + { + prepareTest( "oxxxxx", ievt ); + const fptype fmass = mass0[ievt]; + oxxxxx( ievt0Momenta, fmass, nhel, nsp, fp_outwfO, ipar0 ); + testwf6( outwfO, "oxxxxx", ievt, nsp, fmass ); + oxxxxx( ievt0Momenta, -fmass, nhel, nsp, fp_outwfO, ipar0 ); + testwf6( outwfO, "oxxxxx", ievt, nsp, -fmass ); + } + // Test opzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzgt0[ievt] ) + { + prepareTest( "opzxxx", ievt ); + opzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfO, "opzxxx", ievt ); + testwf6( outwf, "opzxxx", ievt, nsp, 0 ); + } + // Test omzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzlt0[ievt] ) + { + prepareTest( "omzxxx", ievt ); + omzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfO, "omzxxx", ievt ); + testwf6( outwf, "omzxxx", ievt, nsp, 0 ); + } + // Test oxzxxx - ASSUMPTIONS: (FMASS == 0) and (PT > 0) + if( mass0[ievt] == 0 && isptgt0[ievt] ) + { + prepareTest( "oxzxxx", ievt ); + oxzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfO, "oxzxxx", ievt ); + testwf6( outwf, "oxzxxx", ievt, nsp, 0 ); + } + } + } + // *** END OF TESTING LOOP + if( dumpEvents ) + { + dumpFile.close(); + std::cout << "INFO: New reference data dumped to file '" << dumpFileName << "'" << std::endl; + } +#ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) + signal( SIGFPE, fpeHandlerDefault ); +#endif +} + +//========================================================================== + +// Reset the GPU after ALL tests have gone out of scope +// (This was needed to avoid leaks in profilers, but compute-sanitizer reports no leaks, is it STILL needed?) 
+// ========= NB: resetting the GPU too early causes segfaults that are very difficult to debug #907 ========= +// Try to use atexit (https://stackoverflow.com/a/14610501) but this still crashes! +// ********* FIXME? avoid CUDA API calls in destructors? (see https://stackoverflow.com/a/16982503) ********* +void +myexit() +{ +#ifdef MGONGPUCPP_GPUIMPL + //checkGpu( gpuDeviceReset() ); // FIXME??? this still crashes! should systematically avoid CUDA calls in all destructors? +#endif +} + +// Main function (see https://google.github.io/googletest/primer.html#writing-the-main-function) +// (NB: the test executables are now separate for C++ and CUDA, therefore main must be included all the time) +// (NB: previously, '#ifndef MGONGPUCPP_GPUIMPL' was ensuring that main was only included once while linking both C++ and CUDA tests) +int +main( int argc, char** argv ) +{ + atexit( myexit ); + testing::InitGoogleTest( &argc, argv ); + int status = RUN_ALL_TESTS(); + return status; +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx_cc_ref.txt b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx_cc_ref.txt new file mode 100644 index 0000000000..c1c4ca5766 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx_cc_ref.txt @@ -0,0 +1,4036 @@ + // Copyright (C) 2020-2024 CERN and UCLouvain. + // Licensed under the GNU Lesser General Public License (version 3 or later). + // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. + // Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=0: ixxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=1: ixxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=2: ipzxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=3: vxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=3: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=3: vxxxxx#0 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=3: vxxxxx#0 nsp=-1 mass=0 + 
0.000000000000000e+00, -7.071067811865476e-01, // itest=3: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=3: vxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=4: vxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=4: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=4: vxxxxx#0 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=4: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=4: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=4: vxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=5: sxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=5: sxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=6: sxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=6: sxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=7: oxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 
+ 0.000000000000000e+00, 0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=7: oxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=8: oxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=8: oxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=9: opzxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=10: ixxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- 
+ 5.000000000000000e+02, -5.000000000000000e+02, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=11: ixxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=12: imzxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=13: vxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=13: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=13: vxxxxx#1 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=13: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=13: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=13: vxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=14: vxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=14: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=14: vxxxxx#1 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=14: vxxxxx#1 nsp=-1 mass=0 + 
0.000000000000000e+00, 7.071067811865476e-01, // itest=14: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=14: vxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=15: sxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=15: sxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=16: sxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=16: sxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=17: oxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=17: oxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=18: oxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=18: 
oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=18: oxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=19: omzxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=20: ixxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=20: ixxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=21: ixxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=21: ixxxxx#2 nsp=-1 mass=0 
+ expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=22: ixzxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=22: ixzxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=23: vxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=23: vxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=23: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=23: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=23: vxxxxx#2 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=23: vxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=24: vxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=24: vxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=24: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=24: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=24: vxxxxx#2 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=24: vxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=25: 
sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=25: sxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=26: sxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=27: oxxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=27: oxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=28: oxxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=28: oxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, 
-4.000000000000000e+02, // itest=29: oxzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=29: oxzxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=29: oxzxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=30: ixxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=30: ixxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=31: ixxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=31: ixxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=32: ixzxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); 
// itest=32: ixzxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=33: vxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=33: vxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=33: vxxxxx#3 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=33: vxxxxx#3 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=33: vxxxxx#3 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=33: vxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=34: vxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=34: vxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=34: vxxxxx#3 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=34: vxxxxx#3 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=34: vxxxxx#3 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=34: vxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=35: sxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=35: sxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=36: sxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=36: sxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=37: oxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=37: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=37: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=37: oxxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=37: oxxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=37: oxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=38: oxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=38: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=38: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=38: oxxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=38: oxxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=38: oxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=39: oxzxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=39: oxzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=39: oxzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=39: oxzxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=39: oxzxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=39: oxzxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 
1.800000000000000e+02, 2.400000000000000e+02, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=40: ixxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=40: ixxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=41: ixxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=41: ixxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=42: ixzxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=42: ixzxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=43: vxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 4.242640687119285e-01, 
0.000000000000000e+00 } ); // itest=43: vxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=44: vxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=44: vxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=45: sxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=45: sxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=46: sxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=46: sxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=47: oxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=47: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=47: oxxxxx#4 nsp=-1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=47: oxxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=47: oxxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=47: oxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=48: oxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=48: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=48: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=48: oxxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=48: oxxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=48: oxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=49: oxzxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=49: oxzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=49: oxzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=49: oxzxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=49: oxzxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=49: oxzxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=50: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=50: ixxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=51: ixxxxx#5 
nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=51: ixxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=52: ipzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=52: ipzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=52: ipzxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=52: ipzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=52: ipzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=52: ipzxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=53: vxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=53: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=53: vxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=54: vxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=54: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=54: vxxxxx#5 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=54: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=54: vxxxxx#5 nsp=-1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=54: vxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=55: sxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=55: sxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=56: sxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=56: sxxxxx#5 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=56: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=56: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=56: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=56: sxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=57: oxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=57: oxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=58: oxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=58: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=58: 
oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=58: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=58: oxxxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=58: oxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=59: opzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: opzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: opzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: opzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: opzxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=59: opzxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=60: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=60: ixxxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=60: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=60: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=60: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=60: ixxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=61: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=61: ixxxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=61: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=61: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=61: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=61: ixxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, 
// itest=62: imzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: imzxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=62: imzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: imzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: imzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=62: imzxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=63: vxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=63: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=63: vxxxxx#6 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=63: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=63: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=63: vxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=64: vxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=64: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=64: vxxxxx#6 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=64: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=64: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=64: vxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=65: sxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=65: sxxxxx#6 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=65: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=65: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=65: sxxxxx#6 
nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=65: sxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=66: sxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=66: sxxxxx#6 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=66: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=66: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=66: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=66: sxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=67: oxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=67: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=67: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=67: oxxxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=67: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=67: oxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=68: oxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=68: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=68: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=68: oxxxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=68: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=68: oxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=69: omzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=69: omzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=69: omzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=69: omzxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=69: omzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=69: omzxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=70: ixxxxx#7 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=70: ixxxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=70: ixxxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=70: ixxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=70: ixxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=70: ixxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=71: ixxxxx#7 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=71: ixxxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=71: ixxxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=71: ixxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=71: ixxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=71: ixxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=72: ixzxxx#7 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=72: ixzxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=72: ixzxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=72: ixzxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=72: ixzxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=72: ixzxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 
-0.000000000000000e+00, // itest=73: vxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=73: vxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=73: vxxxxx#7 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=73: vxxxxx#7 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=73: vxxxxx#7 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=73: vxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=74: vxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=74: vxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=74: vxxxxx#7 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=74: vxxxxx#7 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=74: vxxxxx#7 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=74: vxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=75: sxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=75: sxxxxx#7 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=75: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=75: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=75: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=75: sxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=76: sxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=76: sxxxxx#7 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=76: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=76: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=76: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=76: sxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=77: oxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=77: oxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=77: oxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=77: oxxxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=77: oxxxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=77: oxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=78: oxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=78: oxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: oxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: oxxxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=78: oxxxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=78: oxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=79: oxzxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=79: oxzxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: oxzxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: oxzxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=79: oxzxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=79: oxzxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=80: ixxxxx#8 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=80: ixxxxx#8 nsp=-1 mass=0 + 
6.000000000000000e+00, -8.000000000000000e+00, // itest=80: ixxxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=80: ixxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=80: ixxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=80: ixxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=81: ixxxxx#8 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=81: ixxxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=81: ixxxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=81: ixxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=81: ixxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=81: ixxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=82: ixzxxx#8 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=82: ixzxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=82: ixzxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=82: ixzxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=82: ixzxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=82: ixzxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=83: vxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=83: vxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=83: vxxxxx#8 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=83: vxxxxx#8 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=83: vxxxxx#8 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=83: vxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( 
{ // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=84: vxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=84: vxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=84: vxxxxx#8 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=84: vxxxxx#8 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=84: vxxxxx#8 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=84: vxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=85: sxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=85: sxxxxx#8 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=85: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=85: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=85: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=85: sxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=86: sxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=86: sxxxxx#8 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=86: sxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=87: oxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=87: oxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=87: oxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=87: oxxxxx#8 nsp=-1 mass=0 
+ 6.000000000000000e+00, 8.000000000000000e+00, // itest=87: oxxxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=87: oxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=88: oxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=88: oxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=88: oxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=88: oxxxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=88: oxxxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=88: oxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=89: oxzxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=89: oxzxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=89: oxzxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=89: oxzxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=89: oxzxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=89: oxzxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=90: ixxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=90: ixxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=90: ixxxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=90: ixxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=90: ixxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=90: ixxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=91: ixxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=91: 
ixxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=91: ixxxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=91: ixxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=91: ixxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=91: ixxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=92: ixzxxx#9 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=92: ixzxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=92: ixzxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=92: ixzxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=92: ixzxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=92: ixzxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=93: vxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=93: vxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=93: vxxxxx#9 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=93: vxxxxx#9 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=93: vxxxxx#9 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=93: vxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=94: vxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=94: vxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=94: vxxxxx#9 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=94: vxxxxx#9 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=94: vxxxxx#9 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=94: vxxxxx#9 nsp=-1 
mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=95: sxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=95: sxxxxx#9 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=95: sxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=96: sxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=96: sxxxxx#9 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=96: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=96: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=96: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=96: sxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=97: oxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=97: oxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=97: oxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=97: oxxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=97: oxxxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=97: oxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=98: oxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=98: oxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=98: oxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=98: 
oxxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=98: oxxxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=98: oxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=99: oxzxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=99: oxzxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=99: oxzxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=99: oxzxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=99: oxzxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=99: oxzxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=100: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=100: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=100: ixxxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=100: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=100: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=100: ixxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=101: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=101: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=101: ixxxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=101: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=101: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=101: ixxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=102: ipzxxx#10 nsp=-1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=102: ipzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=102: ipzxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=102: ipzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=102: ipzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=102: ipzxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=103: vxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=103: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=103: vxxxxx#10 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=103: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=103: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=103: vxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=104: vxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=104: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=104: vxxxxx#10 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=104: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=104: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=104: vxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=105: sxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=105: sxxxxx#10 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=105: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=105: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=105: 
sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=105: sxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=106: sxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=106: sxxxxx#10 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=106: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=106: sxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=107: oxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=107: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: oxxxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=107: oxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=108: oxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=108: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: oxxxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=108: oxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=109: opzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: opzxxx#10 nsp=-1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: opzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: opzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: opzxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=109: opzxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=110: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=110: ixxxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=110: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=110: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=110: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=110: ixxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=111: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: ixxxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=111: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=111: ixxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=112: imzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: imzxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=112: imzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: imzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: imzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // 
itest=112: imzxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=113: vxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=113: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: vxxxxx#11 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=113: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=113: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=113: vxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=114: vxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=114: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: vxxxxx#11 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=114: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=114: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=114: vxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=115: sxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=115: sxxxxx#11 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=115: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=115: sxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=116: sxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=116: sxxxxx#11 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=116: sxxxxx#11 nsp=-1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=116: sxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=117: oxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=117: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: oxxxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=117: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=117: oxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=118: oxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=118: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: oxxxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=118: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=118: oxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=119: omzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=119: omzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=119: omzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=119: omzxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=119: omzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=119: omzxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, 0.000000000000000e+00, // itest=120: ixxxxx#12 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=120: ixxxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=120: ixxxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=120: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=120: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=120: ixxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=121: ixxxxx#12 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=121: ixxxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=121: ixxxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=121: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=121: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=121: ixxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=122: ixzxxx#12 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=122: ixzxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=122: ixzxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=122: ixzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=122: ixzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=122: ixzxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=123: vxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=123: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=123: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=123: 
vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=123: vxxxxx#12 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=123: vxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=124: vxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=124: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=124: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=124: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=124: vxxxxx#12 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=124: vxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=125: sxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=125: sxxxxx#12 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=125: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=125: sxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=126: sxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=126: sxxxxx#12 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=126: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=126: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=126: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=126: sxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=127: oxxxxx#12 nsp=-1 
mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=127: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=127: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=127: oxxxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=127: oxxxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=127: oxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=128: oxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=128: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=128: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=128: oxxxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=128: oxxxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=128: oxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=129: oxzxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=129: oxzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=129: oxzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=129: oxzxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=129: oxzxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=129: oxzxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=130: ixxxxx#13 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=130: ixxxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=130: ixxxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=130: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=130: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=130: ixxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=131: ixxxxx#13 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=131: ixxxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=131: ixxxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=131: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=131: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=131: ixxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=132: ixzxxx#13 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=132: ixzxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=132: ixzxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=132: ixzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=132: ixzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=132: ixzxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=133: vxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=133: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=133: vxxxxx#13 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=133: vxxxxx#13 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=133: vxxxxx#13 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=133: vxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=134: vxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=134: vxxxxx#13 
nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=134: vxxxxx#13 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=134: vxxxxx#13 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=134: vxxxxx#13 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=134: vxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=135: sxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=135: sxxxxx#13 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=135: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=135: sxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=136: sxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=136: sxxxxx#13 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=136: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=136: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=136: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=136: sxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=137: oxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=137: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=137: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=137: oxxxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=137: oxxxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 
} ); // itest=137: oxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=138: oxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=138: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=138: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=138: oxxxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=138: oxxxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=138: oxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=139: oxzxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=139: oxzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=139: oxzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=139: oxzxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=139: oxzxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=139: oxzxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=140: ixxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=140: ixxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=140: ixxxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=140: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=140: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=140: ixxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=141: ixxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=141: ixxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=141: 
ixxxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=141: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=141: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=141: ixxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=142: ixzxxx#14 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=142: ixzxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=142: ixzxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=142: ixzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=142: ixzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=142: ixzxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=143: vxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=143: vxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=143: vxxxxx#14 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=143: vxxxxx#14 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=143: vxxxxx#14 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=143: vxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=144: vxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=144: vxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=144: vxxxxx#14 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=144: vxxxxx#14 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=144: vxxxxx#14 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=144: vxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 
-5.000000000000000e+02, 4.000000000000000e+02, // itest=145: sxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=145: sxxxxx#14 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=145: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=145: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=145: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=145: sxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=146: sxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=146: sxxxxx#14 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=146: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=146: sxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=147: oxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=147: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: oxxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=147: oxxxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=147: oxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=148: oxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=148: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: 
oxxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=148: oxxxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=148: oxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=149: oxzxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=149: oxzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: oxzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: oxzxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=149: oxzxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=149: oxzxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=150: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: ixxxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=150: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=150: ixxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=151: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: ixxxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=151: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=151: ixxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=152: ipzxxx#15 nsp=-1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=152: ipzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: ipzxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=152: ipzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: ipzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=152: ipzxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=153: vxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=153: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=153: vxxxxx#15 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=153: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=153: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=153: vxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=154: vxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=154: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=154: vxxxxx#15 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=154: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=154: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=154: vxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=155: sxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=155: sxxxxx#15 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=155: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: 
sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=155: sxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=156: sxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=156: sxxxxx#15 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=156: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=156: sxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=157: oxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=157: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: oxxxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=157: oxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=158: oxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=158: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: oxxxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=158: oxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=159: opzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: opzxxx#15 nsp=-1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: opzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: opzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: opzxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=159: opzxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=160: ixxxxx#16 nsp=-1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=161: ixxxxx#16 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=162: vxxxxx#16 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=162: vxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: vxxxxx#16 nsp=-1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=162: vxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=162: vxxxxx#16 nsp=-1 mass=500 + 
0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=162: vxxxxx#16 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=163: vxxxxx#16 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=164: sxxxxx#16 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=165: sxxxxx#16 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=166: oxxxxx#16 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // 
itest=166: oxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=166: oxxxxx#16 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=166: oxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=166: oxxxxx#16 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=166: oxxxxx#16 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=167: oxxxxx#16 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=168: ixxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=168: ixxxxx#17 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=168: ixxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixxxxx#17 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=168: ixxxxx#17 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=169: ixxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=169: ixxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=169: ixxxxx#17 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=169: ixxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // 
itest=169: ixxxxx#17 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=169: ixxxxx#17 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=170: vxxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=170: vxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=170: vxxxxx#17 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=170: vxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=170: vxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=170: vxxxxx#17 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=171: vxxxxx#17 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=172: sxxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=172: sxxxxx#17 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=172: sxxxxx#17 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + 
-0.000000000000000e+00, -0.000000000000000e+00, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=173: sxxxxx#17 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=174: oxxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=174: oxxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=174: oxxxxx#17 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=174: oxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=174: oxxxxx#17 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=174: oxxxxx#17 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=175: oxxxxx#17 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=176: ixxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=176: ixxxxx#18 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=176: ixxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=176: ixxxxx#18 nsp=-1 mass=400 + 
1.414213562373095e+01, 0.000000000000000e+00, // itest=176: ixxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=176: ixxxxx#18 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=177: ixxxxx#18 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=178: vxxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=178: vxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=178: vxxxxx#18 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=178: vxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=178: vxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=178: vxxxxx#18 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=179: vxxxxx#18 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // 
itest=180: sxxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=180: sxxxxx#18 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=180: sxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=180: sxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=180: sxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=180: sxxxxx#18 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=181: sxxxxx#18 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=182: oxxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=182: oxxxxx#18 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=182: oxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=182: oxxxxx#18 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=182: oxxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=182: oxxxxx#18 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=183: oxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=183: oxxxxx#18 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=183: oxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // 
itest=183: oxxxxx#18 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=183: oxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=183: oxxxxx#18 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=184: ixxxxx#19 nsp=-1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=184: ixxxxx#19 nsp=-1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=184: ixxxxx#19 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=184: ixxxxx#19 nsp=-1 mass=400 + -5.999999999999999e+00, 7.999999999999999e+00, // itest=184: ixxxxx#19 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=184: ixxxxx#19 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + 5.999999999999999e+00, -7.999999999999999e+00, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=185: ixxxxx#19 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=186: vxxxxx#19 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=186: vxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=186: vxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=186: vxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=186: vxxxxx#19 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=186: vxxxxx#19 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 
-5.000000000000000e+02, -0.000000000000000e+00, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=187: vxxxxx#19 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=188: sxxxxx#19 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=188: sxxxxx#19 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=188: sxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: sxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: sxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=188: sxxxxx#19 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=189: sxxxxx#19 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=190: oxxxxx#19 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=190: oxxxxx#19 nsp=-1 mass=400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=190: oxxxxx#19 nsp=-1 mass=400 + 
1.000000000000000e+01, 0.000000000000000e+00, // itest=190: oxxxxx#19 nsp=-1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=190: oxxxxx#19 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=190: oxxxxx#19 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=191: oxxxxx#19 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=192: ixxxxx#20 nsp=-1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=192: ixxxxx#20 nsp=-1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=192: ixxxxx#20 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=192: ixxxxx#20 nsp=-1 mass=400 + 7.999999999999999e+00, -5.999999999999999e+00, // itest=192: ixxxxx#20 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=192: ixxxxx#20 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -7.999999999999999e+00, 5.999999999999999e+00, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=193: ixxxxx#20 nsp=-1 
mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=194: vxxxxx#20 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=195: vxxxxx#20 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=196: sxxxxx#20 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=197: sxxxxx#20 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=197: sxxxxx#20 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // 
itest=197: sxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=197: sxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=197: sxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=197: sxxxxx#20 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=198: oxxxxx#20 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=198: oxxxxx#20 nsp=-1 mass=400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=198: oxxxxx#20 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=198: oxxxxx#20 nsp=-1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=198: oxxxxx#20 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=198: oxxxxx#20 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=199: oxxxxx#20 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=200: ixxxxx#21 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=200: ixxxxx#21 nsp=-1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=200: ixxxxx#21 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=200: ixxxxx#21 nsp=-1 mass=400 + -4.931969619160719e+00, 5.260767593771432e+00, // itest=200: ixxxxx#21 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00 } ); // 
itest=200: ixxxxx#21 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + 4.931969619160719e+00, -5.260767593771432e+00, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00 } ); // itest=201: ixxxxx#21 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=202: vxxxxx#21 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=202: vxxxxx#21 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=202: vxxxxx#21 nsp=-1 mass=400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=202: vxxxxx#21 nsp=-1 mass=400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=202: vxxxxx#21 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=202: vxxxxx#21 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=203: vxxxxx#21 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=204: sxxxxx#21 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=204: sxxxxx#21 nsp=-1 mass=400 + 
1.000000000000000e+00, 0.000000000000000e+00, // itest=204: sxxxxx#21 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=204: sxxxxx#21 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=204: sxxxxx#21 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=204: sxxxxx#21 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=205: sxxxxx#21 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=206: oxxxxx#21 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=206: oxxxxx#21 nsp=-1 mass=400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=206: oxxxxx#21 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=206: oxxxxx#21 nsp=-1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=206: oxxxxx#21 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=206: oxxxxx#21 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + 
-2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=207: oxxxxx#21 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=208: ixxxxx#22 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=208: ixxxxx#22 nsp=-1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=208: ixxxxx#22 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=208: ixxxxx#22 nsp=-1 mass=400 + -8.320502943378436e+00, 8.875203139603666e+00, // itest=208: ixxxxx#22 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00 } ); // itest=208: ixxxxx#22 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + 8.320502943378436e+00, -8.875203139603666e+00, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00 } ); // itest=209: ixxxxx#22 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=210: vxxxxx#22 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=210: vxxxxx#22 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=210: vxxxxx#22 nsp=-1 mass=400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=210: vxxxxx#22 nsp=-1 mass=400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=210: vxxxxx#22 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=210: vxxxxx#22 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=211: vxxxxx#22 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // 
itest=211: vxxxxx#22 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=211: vxxxxx#22 nsp=-1 mass=-400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=211: vxxxxx#22 nsp=-1 mass=-400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=211: vxxxxx#22 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=211: vxxxxx#22 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=212: sxxxxx#22 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=212: sxxxxx#22 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=212: sxxxxx#22 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: sxxxxx#22 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: sxxxxx#22 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=212: sxxxxx#22 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=213: sxxxxx#22 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=214: oxxxxx#22 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=214: oxxxxx#22 nsp=-1 mass=400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=214: oxxxxx#22 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=214: oxxxxx#22 nsp=-1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=214: 
oxxxxx#22 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=214: oxxxxx#22 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=215: oxxxxx#22 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=216: ixxxxx#23 nsp=-1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=217: ixxxxx#23 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=218: vxxxxx#23 nsp=-1 mass=500 + -0.000000000000000e+00, 
-0.000000000000000e+00, // itest=218: vxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: vxxxxx#23 nsp=-1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=218: vxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=218: vxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=218: vxxxxx#23 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=219: vxxxxx#23 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=220: sxxxxx#23 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=221: sxxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=221: sxxxxx#23 nsp=-1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=221: sxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=221: sxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=221: sxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=221: sxxxxx#23 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=222: oxxxxx#23 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=223: oxxxxx#23 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=224: ixxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=224: ixxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=224: ixxxxx#24 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=224: ixxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=224: ixxxxx#24 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=224: ixxxxx#24 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=225: ixxxxx#24 nsp=-1 
mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=225: ixxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=225: ixxxxx#24 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=225: ixxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=225: ixxxxx#24 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=225: ixxxxx#24 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=226: vxxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=226: vxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=226: vxxxxx#24 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=226: vxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=226: vxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=226: vxxxxx#24 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=227: vxxxxx#24 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=228: sxxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=228: sxxxxx#24 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=228: sxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=228: sxxxxx#24 nsp=-1 
mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=228: sxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=228: sxxxxx#24 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=229: sxxxxx#24 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=230: oxxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=230: oxxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=230: oxxxxx#24 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=230: oxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=230: oxxxxx#24 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=230: oxxxxx#24 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=231: oxxxxx#24 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 
-3.000000000000000e+02, // itest=232: ixxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=232: ixxxxx#25 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=232: ixxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=232: ixxxxx#25 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=232: ixxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=232: ixxxxx#25 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=233: ixxxxx#25 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=234: vxxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=234: vxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=234: vxxxxx#25 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=234: vxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=234: vxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=234: vxxxxx#25 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=235: vxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=235: vxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=235: vxxxxx#25 nsp=-1 mass=-400 + -7.071067811865476e-01, 
0.000000000000000e+00, // itest=235: vxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=235: vxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=235: vxxxxx#25 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=236: sxxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=236: sxxxxx#25 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=236: sxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=236: sxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=236: sxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=236: sxxxxx#25 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=237: sxxxxx#25 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=238: oxxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=238: oxxxxx#25 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=238: oxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=238: oxxxxx#25 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=238: oxxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=238: oxxxxx#25 nsp=-1 mass=400 + expwfs.push_back( { // 
--------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=239: oxxxxx#25 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=240: ixxxxx#26 nsp=-1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=240: ixxxxx#26 nsp=-1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=240: ixxxxx#26 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=240: ixxxxx#26 nsp=-1 mass=400 + -5.999999999999999e+00, 7.999999999999999e+00, // itest=240: ixxxxx#26 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=240: ixxxxx#26 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + 5.999999999999999e+00, -7.999999999999999e+00, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=241: ixxxxx#26 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=242: vxxxxx#26 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=242: vxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=242: vxxxxx#26 
nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=242: vxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=242: vxxxxx#26 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=242: vxxxxx#26 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=243: vxxxxx#26 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=244: sxxxxx#26 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=244: sxxxxx#26 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=244: sxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=244: sxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=244: sxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=244: sxxxxx#26 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=245: sxxxxx#26 
nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=246: oxxxxx#26 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=246: oxxxxx#26 nsp=-1 mass=400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=246: oxxxxx#26 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=246: oxxxxx#26 nsp=-1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=246: oxxxxx#26 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=246: oxxxxx#26 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=247: oxxxxx#26 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=248: ixxxxx#27 nsp=-1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=248: ixxxxx#27 nsp=-1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=248: ixxxxx#27 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=248: ixxxxx#27 nsp=-1 mass=400 + 7.999999999999999e+00, -5.999999999999999e+00, // itest=248: ixxxxx#27 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=248: ixxxxx#27 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=249: ixxxxx#27 nsp=-1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=249: ixxxxx#27 nsp=-1 mass=-400 + -1.600000000000000e+01, 
1.200000000000000e+01, // itest=249: ixxxxx#27 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=249: ixxxxx#27 nsp=-1 mass=-400 + -7.999999999999999e+00, 5.999999999999999e+00, // itest=249: ixxxxx#27 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=249: ixxxxx#27 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=250: vxxxxx#27 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=251: vxxxxx#27 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=252: sxxxxx#27 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=252: sxxxxx#27 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=252: sxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=252: sxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=252: sxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=252: sxxxxx#27 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=253: sxxxxx#27 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=254: oxxxxx#27 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=254: oxxxxx#27 nsp=-1 mass=400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=254: oxxxxx#27 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=254: oxxxxx#27 nsp=-1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=254: oxxxxx#27 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=254: oxxxxx#27 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=255: oxxxxx#27 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=256: ixxxxx#28 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=256: ixxxxx#28 nsp=-1 
mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=256: ixxxxx#28 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=256: ixxxxx#28 nsp=-1 mass=400 + -4.931969619160719e+00, 5.260767593771432e+00, // itest=256: ixxxxx#28 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00 } ); // itest=256: ixxxxx#28 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + 4.931969619160719e+00, -5.260767593771432e+00, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00 } ); // itest=257: ixxxxx#28 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=258: vxxxxx#28 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=258: vxxxxx#28 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: vxxxxx#28 nsp=-1 mass=400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=258: vxxxxx#28 nsp=-1 mass=400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=258: vxxxxx#28 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=258: vxxxxx#28 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=259: vxxxxx#28 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=259: vxxxxx#28 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=259: vxxxxx#28 nsp=-1 mass=-400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=259: vxxxxx#28 nsp=-1 mass=-400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=259: vxxxxx#28 nsp=-1 
mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=259: vxxxxx#28 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=260: sxxxxx#28 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=260: sxxxxx#28 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=260: sxxxxx#28 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=260: sxxxxx#28 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=260: sxxxxx#28 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=260: sxxxxx#28 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=261: sxxxxx#28 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=262: oxxxxx#28 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=262: oxxxxx#28 nsp=-1 mass=400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=262: oxxxxx#28 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=262: oxxxxx#28 nsp=-1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=262: oxxxxx#28 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=262: oxxxxx#28 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + -1.800000000000000e+02, 
-1.920000000000000e+02, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=263: oxxxxx#28 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=264: ixxxxx#29 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=264: ixxxxx#29 nsp=-1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=264: ixxxxx#29 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=264: ixxxxx#29 nsp=-1 mass=400 + -8.320502943378436e+00, 8.875203139603666e+00, // itest=264: ixxxxx#29 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00 } ); // itest=264: ixxxxx#29 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + 8.320502943378436e+00, -8.875203139603666e+00, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00 } ); // itest=265: ixxxxx#29 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=266: vxxxxx#29 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=266: vxxxxx#29 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=266: vxxxxx#29 nsp=-1 mass=400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=266: vxxxxx#29 nsp=-1 mass=400 + 2.476131380041579e-01, 
-4.836194101643708e-01, // itest=266: vxxxxx#29 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=266: vxxxxx#29 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=267: vxxxxx#29 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=268: sxxxxx#29 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=268: sxxxxx#29 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#29 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#29 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#29 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=268: sxxxxx#29 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=269: sxxxxx#29 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=270: oxxxxx#29 nsp=-1 
mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=270: oxxxxx#29 nsp=-1 mass=400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=270: oxxxxx#29 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=270: oxxxxx#29 nsp=-1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=270: oxxxxx#29 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=270: oxxxxx#29 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=271: oxxxxx#29 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=272: ixxxxx#30 nsp=-1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 
+ -0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=273: ixxxxx#30 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=274: vxxxxx#30 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=274: vxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=274: vxxxxx#30 nsp=-1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=274: vxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=274: vxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=274: vxxxxx#30 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=275: vxxxxx#30 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=276: sxxxxx#30 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // 
itest=277: sxxxxx#30 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=277: sxxxxx#30 nsp=-1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=277: sxxxxx#30 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=278: oxxxxx#30 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=279: oxxxxx#30 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=280: ixxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=280: ixxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=280: ixxxxx#31 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // 
itest=280: ixxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=280: ixxxxx#31 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=280: ixxxxx#31 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=281: ixxxxx#31 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=282: vxxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=282: vxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=282: vxxxxx#31 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=282: vxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=282: vxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=282: vxxxxx#31 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=283: vxxxxx#31 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=283: vxxxxx#31 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=283: vxxxxx#31 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=283: vxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=283: vxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=283: vxxxxx#31 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 
-5.000000000000000e+02, -3.000000000000000e+02, // itest=284: sxxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=284: sxxxxx#31 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=284: sxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: sxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: sxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=284: sxxxxx#31 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=285: sxxxxx#31 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=286: oxxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=286: oxxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=286: oxxxxx#31 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=286: oxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=286: oxxxxx#31 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=286: oxxxxx#31 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + 
-1.414213562373095e+01, -0.000000000000000e+00, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=287: oxxxxx#31 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=288: ixxxxx#0 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=288: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: ixxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=288: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=288: ixxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=289: ixxxxx#0 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=289: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=289: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=289: ixxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=289: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=289: ixxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=290: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=290: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=290: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=290: ipzxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=290: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=290: ipzxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // 
itest=291: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=291: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=291: vxxxxx#0 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=291: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=291: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=291: vxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=292: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=292: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=292: vxxxxx#0 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=292: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=292: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=292: vxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=293: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=293: sxxxxx#0 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=293: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=293: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=293: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=293: sxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=294: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=294: sxxxxx#0 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=294: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=294: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=294: sxxxxx#0 nsp=1 mass=0 
+ 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=294: sxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=295: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=295: oxxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=295: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=295: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=295: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=295: oxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=296: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=296: oxxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=296: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=296: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=296: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=296: oxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=297: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=297: opzxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=297: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=297: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=297: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=297: opzxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=298: ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=298: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=298: ixxxxx#1 
nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=298: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=298: ixxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=298: ixxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=299: ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=299: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=299: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=299: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=299: ixxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=299: ixxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=300: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=300: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=300: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=300: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=300: imzxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=300: imzxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=301: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=301: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=301: vxxxxx#1 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=301: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=301: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=301: vxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // 
itest=302: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=302: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=302: vxxxxx#1 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=302: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=302: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=302: vxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=303: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=303: sxxxxx#1 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=303: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=303: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=303: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=303: sxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=304: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=304: sxxxxx#1 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=304: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=304: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=304: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=304: sxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=305: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=305: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=305: oxxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=305: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=305: oxxxxx#1 nsp=1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=305: oxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=306: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=306: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=306: oxxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=306: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=306: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=306: oxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=307: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=307: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=307: omzxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=307: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=307: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=307: omzxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=308: ixxxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=308: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=308: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=308: ixxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=308: ixxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=308: ixxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=309: ixxxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=309: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=309: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=309: ixxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=309: ixxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=309: ixxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=310: ixzxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=310: ixzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=310: ixzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=310: ixzxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=310: ixzxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=310: ixzxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=311: vxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=311: vxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=311: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=311: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=311: vxxxxx#2 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=311: vxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=312: vxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=312: vxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=312: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=312: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=312: vxxxxx#2 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=312: vxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 
0.000000000000000e+00, // itest=313: sxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=313: sxxxxx#2 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=313: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=313: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=313: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=313: sxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=314: sxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=314: sxxxxx#2 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=314: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=314: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=314: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=314: sxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=315: oxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=315: oxxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=315: oxxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=315: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=315: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=315: oxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=316: oxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=316: oxxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=316: oxxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=316: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=316: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=316: oxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=317: oxzxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=317: oxzxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=317: oxzxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=317: oxzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=317: oxzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=317: oxzxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=318: ixxxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=318: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=318: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=318: ixxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=318: ixxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=318: ixxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=319: ixxxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=319: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=319: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=319: ixxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=319: ixxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=319: ixxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=320: ixzxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=320: ixzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=320: ixzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=320: ixzxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=320: ixzxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=320: ixzxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=321: vxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=321: vxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=321: vxxxxx#3 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=321: vxxxxx#3 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=321: vxxxxx#3 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=321: vxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=322: vxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=322: vxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=322: vxxxxx#3 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=322: vxxxxx#3 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=322: vxxxxx#3 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=322: vxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=323: sxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=323: sxxxxx#3 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=323: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=323: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=323: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=323: sxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, 4.000000000000000e+02, // itest=324: sxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=324: sxxxxx#3 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=324: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=324: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=324: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=324: sxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=325: oxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=325: oxxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=325: oxxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=325: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=325: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=325: oxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=326: oxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=326: oxxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=326: oxxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=326: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=326: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=326: oxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=327: oxzxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=327: oxzxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=327: oxzxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=327: oxzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=327: oxzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=327: oxzxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=328: ixxxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=328: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=328: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=328: ixxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=328: ixxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=328: ixxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=329: ixxxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=329: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=329: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=329: ixxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=329: ixxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=329: ixxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=330: ixzxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=330: ixzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=330: ixzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=330: ixzxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=330: ixzxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=330: ixzxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=331: vxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=331: vxxxxx#4 nsp=1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=331: vxxxxx#4 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=331: vxxxxx#4 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=331: vxxxxx#4 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=331: vxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=332: vxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=332: vxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=332: vxxxxx#4 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=332: vxxxxx#4 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=332: vxxxxx#4 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=332: vxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=333: sxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=333: sxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=333: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=333: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=333: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=333: sxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=334: sxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=334: sxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=334: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=334: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=334: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=334: sxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // 
--------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=335: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=335: oxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=335: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=335: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=335: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=335: oxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=336: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=336: oxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=336: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=336: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=336: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=336: oxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=337: oxzxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=337: oxzxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=337: oxzxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=337: oxzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=337: oxzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=337: oxzxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=338: ixxxxx#5 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=338: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=338: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=338: ixxxxx#5 nsp=1 mass=0 + 
3.162277660168379e+01, 0.000000000000000e+00, // itest=338: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=338: ixxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=339: ixxxxx#5 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=339: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=339: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=339: ixxxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=339: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=339: ixxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=340: ipzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=340: ipzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=340: ipzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=340: ipzxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=340: ipzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=340: ipzxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=341: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=341: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=341: vxxxxx#5 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=341: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=341: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=341: vxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=342: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=342: vxxxxx#5 
nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=342: vxxxxx#5 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=342: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=342: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=342: vxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=343: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=343: sxxxxx#5 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=343: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=343: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=343: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=343: sxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=344: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=344: sxxxxx#5 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=344: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=344: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=344: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=344: sxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=345: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=345: oxxxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=345: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=345: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=345: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=345: oxxxxx#5 nsp=1 mass=0 + 
expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=346: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=346: oxxxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=346: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=346: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=346: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=346: oxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=347: opzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=347: opzxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=347: opzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=347: opzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=347: opzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=347: opzxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=348: ixxxxx#6 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=348: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=348: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=348: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=348: ixxxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=348: ixxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=349: ixxxxx#6 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=349: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=349: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=349: ixxxxx#6 nsp=1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=349: ixxxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=349: ixxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=350: imzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=350: imzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=350: imzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=350: imzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=350: imzxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=350: imzxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=351: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=351: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=351: vxxxxx#6 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=351: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=351: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=351: vxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=352: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=352: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=352: vxxxxx#6 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=352: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=352: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=352: vxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=353: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=353: 
sxxxxx#6 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=353: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=353: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=353: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=353: sxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=354: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=354: sxxxxx#6 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=354: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=354: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=354: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=354: sxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=355: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=355: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=355: oxxxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=355: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=355: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=355: oxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=356: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=356: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=356: oxxxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=356: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=356: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=356: oxxxxx#6 nsp=1 mass=0 + 
expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=357: omzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=357: omzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=357: omzxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=357: omzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=357: omzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=357: omzxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=358: ixxxxx#7 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=358: ixxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=358: ixxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=358: ixxxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=358: ixxxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=358: ixxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=359: ixxxxx#7 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=359: ixxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=359: ixxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=359: ixxxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=359: ixxxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=359: ixxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=360: ixzxxx#7 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=360: ixzxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=360: ixzxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=360: ixzxxx#7 
nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=360: ixzxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=360: ixzxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=361: vxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=361: vxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=361: vxxxxx#7 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=361: vxxxxx#7 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=361: vxxxxx#7 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=361: vxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=362: vxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=362: vxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=362: vxxxxx#7 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=362: vxxxxx#7 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=362: vxxxxx#7 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=362: vxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=363: sxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=363: sxxxxx#7 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=363: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=363: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=363: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=363: sxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=364: sxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // 
itest=364: sxxxxx#7 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=364: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=364: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=364: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=364: sxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=365: oxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=365: oxxxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=365: oxxxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=365: oxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=365: oxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=365: oxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=366: oxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=366: oxxxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=366: oxxxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=366: oxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=366: oxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=366: oxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=367: oxzxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=367: oxzxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=367: oxzxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=367: oxzxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=367: oxzxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=367: oxzxxx#7 nsp=1 
mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=368: ixxxxx#8 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=368: ixxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=368: ixxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=368: ixxxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=368: ixxxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=368: ixxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=369: ixxxxx#8 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=369: ixxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=369: ixxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=369: ixxxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=369: ixxxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=369: ixxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=370: ixzxxx#8 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=370: ixzxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=370: ixzxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=370: ixzxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=370: ixzxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=370: ixzxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=371: vxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=371: vxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=371: vxxxxx#8 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=371: 
vxxxxx#8 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=371: vxxxxx#8 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=371: vxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=372: vxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=372: vxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=372: vxxxxx#8 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=372: vxxxxx#8 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=372: vxxxxx#8 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=372: vxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=373: sxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=373: sxxxxx#8 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=373: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=373: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=373: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=373: sxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=374: sxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=374: sxxxxx#8 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=374: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=374: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=374: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=374: sxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=375: oxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // 
itest=375: oxxxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=375: oxxxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=375: oxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=375: oxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=375: oxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=376: oxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=376: oxxxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=376: oxxxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=376: oxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=376: oxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=376: oxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=377: oxzxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=377: oxzxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=377: oxzxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=377: oxzxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=377: oxzxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=377: oxzxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=378: ixxxxx#9 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=378: ixxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=378: ixxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=378: ixxxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=378: ixxxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=378: ixxxxx#9 
nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=379: ixxxxx#9 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=379: ixxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=379: ixxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=379: ixxxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=379: ixxxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=379: ixxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=380: ixzxxx#9 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=380: ixzxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=380: ixzxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=380: ixzxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=380: ixzxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=380: ixzxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=381: vxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=381: vxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=381: vxxxxx#9 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=381: vxxxxx#9 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=381: vxxxxx#9 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=381: vxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=382: vxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=382: vxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=382: vxxxxx#9 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // 
itest=382: vxxxxx#9 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=382: vxxxxx#9 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=382: vxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=383: sxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=383: sxxxxx#9 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=383: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=383: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=383: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=383: sxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=384: sxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=384: sxxxxx#9 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=384: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=384: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=384: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=384: sxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=385: oxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=385: oxxxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=385: oxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=385: oxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=385: oxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=385: oxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=386: oxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 
2.400000000000000e+02, // itest=386: oxxxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=386: oxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=386: oxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=386: oxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=386: oxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=387: oxzxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=387: oxzxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=387: oxzxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=387: oxzxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=387: oxzxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=387: oxzxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=388: ixxxxx#10 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=388: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=388: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=388: ixxxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=388: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=388: ixxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=389: ixxxxx#10 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=389: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=389: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=389: ixxxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=389: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=389: ixxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=390: ipzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=390: ipzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=390: ipzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=390: ipzxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=390: ipzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=390: ipzxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=391: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=391: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=391: vxxxxx#10 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=391: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=391: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=391: vxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=392: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=392: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=392: vxxxxx#10 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=392: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=392: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=392: vxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=393: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=393: sxxxxx#10 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=393: sxxxxx#10 nsp=1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=393: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=393: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=393: sxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=394: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=394: sxxxxx#10 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=394: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=394: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=394: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=394: sxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=395: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=395: oxxxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=395: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=395: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=395: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=395: oxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=396: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=396: oxxxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=396: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=396: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=396: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=396: oxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, 
// itest=397: opzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=397: opzxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=397: opzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=397: opzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=397: opzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=397: opzxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=398: ixxxxx#11 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=398: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=398: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=398: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=398: ixxxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=398: ixxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=399: ixxxxx#11 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=399: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=399: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=399: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=399: ixxxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=399: ixxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=400: imzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=400: imzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=400: imzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=400: imzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=400: imzxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=400: imzxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=401: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=401: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=401: vxxxxx#11 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=401: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=401: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=401: vxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=402: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=402: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=402: vxxxxx#11 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=402: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=402: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=402: vxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=403: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=403: sxxxxx#11 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=403: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=403: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=403: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=403: sxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=404: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=404: sxxxxx#11 nsp=1 mass=0 + 
1.000000000000000e+00, 0.000000000000000e+00, // itest=404: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=404: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=404: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=404: sxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=405: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=405: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=405: oxxxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=405: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=405: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=405: oxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=406: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=406: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=406: oxxxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=406: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=406: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=406: oxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=407: omzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=407: omzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=407: omzxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=407: omzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=407: omzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=407: omzxxx#11 nsp=1 mass=0 + 
expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=408: ixxxxx#12 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=408: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=408: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=408: ixxxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=408: ixxxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=408: ixxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=409: ixxxxx#12 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=409: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=409: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=409: ixxxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=409: ixxxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=409: ixxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=410: ixzxxx#12 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=410: ixzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=410: ixzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=410: ixzxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=410: ixzxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=410: ixzxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=411: vxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=411: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=411: vxxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // 
itest=411: vxxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=411: vxxxxx#12 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=411: vxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=412: vxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=412: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=412: vxxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=412: vxxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=412: vxxxxx#12 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=412: vxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=413: sxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=413: sxxxxx#12 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=413: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=413: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=413: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=413: sxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=414: sxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=414: sxxxxx#12 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=414: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=414: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=414: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=414: sxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=415: oxxxxx#12 nsp=1 mass=0 + 
3.000000000000000e+02, 4.000000000000000e+02, // itest=415: oxxxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=415: oxxxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=415: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=415: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=415: oxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=416: oxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=416: oxxxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=416: oxxxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=416: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=416: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=416: oxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=417: oxzxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=417: oxzxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=417: oxzxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=417: oxzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=417: oxzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=417: oxzxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=418: ixxxxx#13 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=418: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=418: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=418: ixxxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=418: ixxxxx#13 nsp=1 mass=0 + 
6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=418: ixxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=419: ixxxxx#13 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=419: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=419: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=419: ixxxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=419: ixxxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=419: ixxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=420: ixzxxx#13 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=420: ixzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=420: ixzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=420: ixzxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=420: ixzxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=420: ixzxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=421: vxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=421: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=421: vxxxxx#13 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=421: vxxxxx#13 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=421: vxxxxx#13 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=421: vxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=422: vxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=422: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, 
// itest=422: vxxxxx#13 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=422: vxxxxx#13 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=422: vxxxxx#13 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=422: vxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=423: sxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=423: sxxxxx#13 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=423: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=423: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=423: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=423: sxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=424: sxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=424: sxxxxx#13 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=424: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=424: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=424: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=424: sxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=425: oxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=425: oxxxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=425: oxxxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=425: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=425: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=425: oxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, 4.000000000000000e+02, // itest=426: oxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=426: oxxxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=426: oxxxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=426: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=426: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=426: oxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=427: oxzxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=427: oxzxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=427: oxzxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=427: oxzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=427: oxzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=427: oxzxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=428: ixxxxx#14 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=428: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=428: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=428: ixxxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=428: ixxxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=428: ixxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=429: ixxxxx#14 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=429: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=429: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=429: ixxxxx#14 nsp=1 mass=0 + 
1.000000000000000e+01, 0.000000000000000e+00, // itest=429: ixxxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=429: ixxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=430: ixzxxx#14 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=430: ixzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=430: ixzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=430: ixzxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=430: ixzxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=430: ixzxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=431: vxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=431: vxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=431: vxxxxx#14 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=431: vxxxxx#14 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=431: vxxxxx#14 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=431: vxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=432: vxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=432: vxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=432: vxxxxx#14 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=432: vxxxxx#14 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=432: vxxxxx#14 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=432: vxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=433: sxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // 
itest=433: sxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=433: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=433: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=433: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=433: sxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=434: sxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=434: sxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=434: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=434: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=434: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=434: sxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=435: oxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=435: oxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=435: oxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=435: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=435: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=435: oxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=436: oxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=436: oxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=436: oxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=436: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=436: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // 
itest=436: oxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=437: oxzxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=437: oxzxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=437: oxzxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=437: oxzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=437: oxzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=437: oxzxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=438: ixxxxx#15 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=438: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=438: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=438: ixxxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=438: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=438: ixxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=439: ixxxxx#15 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=439: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=439: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=439: ixxxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=439: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=439: ixxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=440: ipzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=440: ipzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=440: ipzxxx#15 nsp=1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=440: ipzxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=440: ipzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=440: ipzxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=441: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=441: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=441: vxxxxx#15 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=441: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=441: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=441: vxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=442: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=442: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=442: vxxxxx#15 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=442: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=442: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=442: vxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=443: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=443: sxxxxx#15 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=443: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=443: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=443: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=443: sxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // 
itest=444: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=444: sxxxxx#15 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=444: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=444: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=444: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=444: sxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=445: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=445: oxxxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=445: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=445: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=445: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=445: oxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=446: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=446: oxxxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=446: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=446: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=446: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=446: oxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=447: opzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=447: opzxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=447: opzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=447: opzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=447: 
opzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=447: opzxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=448: ixxxxx#16 nsp=1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=449: ixxxxx#16 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=450: vxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=450: vxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=450: vxxxxx#16 nsp=1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=450: vxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=450: vxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=450: vxxxxx#16 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=451: vxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=451: 
vxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=451: vxxxxx#16 nsp=1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=451: vxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=451: vxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=451: vxxxxx#16 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=452: sxxxxx#16 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=453: sxxxxx#16 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=454: oxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=454: oxxxxx#16 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=454: oxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=454: oxxxxx#16 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=454: oxxxxx#16 nsp=1 mass=500 + 
0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=454: oxxxxx#16 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=455: oxxxxx#16 nsp=1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=456: ixxxxx#17 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=456: ixxxxx#17 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=456: ixxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=456: ixxxxx#17 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=456: ixxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=456: ixxxxx#17 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=457: ixxxxx#17 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=457: ixxxxx#17 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=457: ixxxxx#17 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=457: ixxxxx#17 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=457: ixxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=457: ixxxxx#17 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=458: vxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=458: vxxxxx#17 nsp=1 
mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=458: vxxxxx#17 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=458: vxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=458: vxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=458: vxxxxx#17 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=459: vxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=459: vxxxxx#17 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=459: vxxxxx#17 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=459: vxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=459: vxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=459: vxxxxx#17 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=460: sxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=460: sxxxxx#17 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=460: sxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=460: sxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=460: sxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=460: sxxxxx#17 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=461: sxxxxx#17 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=462: oxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=462: oxxxxx#17 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=462: oxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=462: oxxxxx#17 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=462: oxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=462: oxxxxx#17 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=463: oxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=463: oxxxxx#17 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=463: oxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=463: oxxxxx#17 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=463: oxxxxx#17 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=463: oxxxxx#17 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=464: ixxxxx#18 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=464: ixxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=464: ixxxxx#18 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00, // itest=464: ixxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=464: ixxxxx#18 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=464: ixxxxx#18 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=465: ixxxxx#18 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=465: ixxxxx#18 nsp=1 mass=-400 + 
-0.000000000000000e+00, -0.000000000000000e+00, // itest=465: ixxxxx#18 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00, // itest=465: ixxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=465: ixxxxx#18 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=465: ixxxxx#18 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=466: vxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=466: vxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=466: vxxxxx#18 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=466: vxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=466: vxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=466: vxxxxx#18 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=467: vxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=467: vxxxxx#18 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=467: vxxxxx#18 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=467: vxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=467: vxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=467: vxxxxx#18 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=468: sxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=468: sxxxxx#18 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=468: sxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=468: sxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=468: sxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=468: sxxxxx#18 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=469: sxxxxx#18 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=470: oxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=470: oxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=470: oxxxxx#18 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=470: oxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=470: oxxxxx#18 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=470: oxxxxx#18 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=471: oxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=471: oxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=471: oxxxxx#18 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=471: oxxxxx#18 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=471: oxxxxx#18 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=471: oxxxxx#18 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=472: ixxxxx#19 nsp=1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=472: ixxxxx#19 nsp=1 mass=400 + 
1.000000000000000e+01, 0.000000000000000e+00, // itest=472: ixxxxx#19 nsp=1 mass=400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=472: ixxxxx#19 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=472: ixxxxx#19 nsp=1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=472: ixxxxx#19 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=473: ixxxxx#19 nsp=1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=473: ixxxxx#19 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=473: ixxxxx#19 nsp=1 mass=-400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=473: ixxxxx#19 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=473: ixxxxx#19 nsp=1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=473: ixxxxx#19 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=474: vxxxxx#19 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=474: vxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=474: vxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=474: vxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=474: vxxxxx#19 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=474: vxxxxx#19 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=475: vxxxxx#19 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=475: vxxxxx#19 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=475: vxxxxx#19 nsp=1 mass=-400 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=475: vxxxxx#19 nsp=1 mass=-400 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=475: vxxxxx#19 nsp=1 mass=-400 + 7.071067811865476e-01, 
0.000000000000000e+00 } ); // itest=475: vxxxxx#19 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=476: sxxxxx#19 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=476: sxxxxx#19 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=476: sxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=476: sxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=476: sxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=476: sxxxxx#19 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=477: sxxxxx#19 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=478: oxxxxx#19 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=478: oxxxxx#19 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=478: oxxxxx#19 nsp=1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=478: oxxxxx#19 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=478: oxxxxx#19 nsp=1 mass=400 + 5.999999999999999e+00, -7.999999999999999e+00 } ); // itest=478: oxxxxx#19 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=479: oxxxxx#19 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=479: oxxxxx#19 nsp=1 mass=-400 + 2.000000000000000e+01, 
0.000000000000000e+00, // itest=479: oxxxxx#19 nsp=1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=479: oxxxxx#19 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=479: oxxxxx#19 nsp=1 mass=-400 + -5.999999999999999e+00, 7.999999999999999e+00 } ); // itest=479: oxxxxx#19 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=480: ixxxxx#20 nsp=1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=480: ixxxxx#20 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=480: ixxxxx#20 nsp=1 mass=400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=480: ixxxxx#20 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=480: ixxxxx#20 nsp=1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=480: ixxxxx#20 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=481: ixxxxx#20 nsp=1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=481: ixxxxx#20 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=481: ixxxxx#20 nsp=1 mass=-400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=481: ixxxxx#20 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=481: ixxxxx#20 nsp=1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=481: ixxxxx#20 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=482: vxxxxx#20 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=482: vxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=482: vxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=482: vxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=482: vxxxxx#20 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); 
// itest=482: vxxxxx#20 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=483: vxxxxx#20 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=483: vxxxxx#20 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=483: vxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=483: vxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=483: vxxxxx#20 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=483: vxxxxx#20 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=484: sxxxxx#20 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=484: sxxxxx#20 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=484: sxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=484: sxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=484: sxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=484: sxxxxx#20 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=485: sxxxxx#20 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=485: sxxxxx#20 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=485: sxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=485: sxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=485: sxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=485: sxxxxx#20 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=486: oxxxxx#20 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=486: oxxxxx#20 nsp=1 mass=400 + 2.000000000000000e+01, 
0.000000000000000e+00, // itest=486: oxxxxx#20 nsp=1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=486: oxxxxx#20 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=486: oxxxxx#20 nsp=1 mass=400 + -7.999999999999999e+00, 5.999999999999999e+00 } ); // itest=486: oxxxxx#20 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=487: oxxxxx#20 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=487: oxxxxx#20 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=487: oxxxxx#20 nsp=1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=487: oxxxxx#20 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=487: oxxxxx#20 nsp=1 mass=-400 + 7.999999999999999e+00, -5.999999999999999e+00 } ); // itest=487: oxxxxx#20 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=488: ixxxxx#21 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=488: ixxxxx#21 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=488: ixxxxx#21 nsp=1 mass=400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=488: ixxxxx#21 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=488: ixxxxx#21 nsp=1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=488: ixxxxx#21 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=489: ixxxxx#21 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=489: ixxxxx#21 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=489: ixxxxx#21 nsp=1 mass=-400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=489: ixxxxx#21 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=489: ixxxxx#21 nsp=1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01 } 
); // itest=489: ixxxxx#21 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=490: vxxxxx#21 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=490: vxxxxx#21 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=490: vxxxxx#21 nsp=1 mass=400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=490: vxxxxx#21 nsp=1 mass=400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=490: vxxxxx#21 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=490: vxxxxx#21 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=491: vxxxxx#21 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=491: vxxxxx#21 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=491: vxxxxx#21 nsp=1 mass=-400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=491: vxxxxx#21 nsp=1 mass=-400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=491: vxxxxx#21 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=491: vxxxxx#21 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=492: sxxxxx#21 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=492: sxxxxx#21 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=492: sxxxxx#21 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=492: sxxxxx#21 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=492: sxxxxx#21 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=492: sxxxxx#21 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=493: sxxxxx#21 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=493: sxxxxx#21 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, 
// itest=493: sxxxxx#21 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=493: sxxxxx#21 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=493: sxxxxx#21 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=493: sxxxxx#21 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=494: oxxxxx#21 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=494: oxxxxx#21 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=494: oxxxxx#21 nsp=1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=494: oxxxxx#21 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=494: oxxxxx#21 nsp=1 mass=400 + 4.931969619160719e+00, -5.260767593771432e+00 } ); // itest=494: oxxxxx#21 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=495: oxxxxx#21 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=495: oxxxxx#21 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=495: oxxxxx#21 nsp=1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=495: oxxxxx#21 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=495: oxxxxx#21 nsp=1 mass=-400 + -4.931969619160719e+00, 5.260767593771432e+00 } ); // itest=495: oxxxxx#21 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=496: ixxxxx#22 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=496: ixxxxx#22 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=496: ixxxxx#22 nsp=1 mass=400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=496: ixxxxx#22 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=496: ixxxxx#22 nsp=1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=496: ixxxxx#22 nsp=1 
mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=497: ixxxxx#22 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=497: ixxxxx#22 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=497: ixxxxx#22 nsp=1 mass=-400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=497: ixxxxx#22 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=497: ixxxxx#22 nsp=1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=497: ixxxxx#22 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=498: vxxxxx#22 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=498: vxxxxx#22 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=498: vxxxxx#22 nsp=1 mass=400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=498: vxxxxx#22 nsp=1 mass=400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=498: vxxxxx#22 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=498: vxxxxx#22 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=499: vxxxxx#22 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=499: vxxxxx#22 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=499: vxxxxx#22 nsp=1 mass=-400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=499: vxxxxx#22 nsp=1 mass=-400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=499: vxxxxx#22 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=499: vxxxxx#22 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=500: sxxxxx#22 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=500: sxxxxx#22 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=500: sxxxxx#22 
nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=500: sxxxxx#22 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=500: sxxxxx#22 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=500: sxxxxx#22 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=501: sxxxxx#22 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=502: oxxxxx#22 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=502: oxxxxx#22 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=502: oxxxxx#22 nsp=1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=502: oxxxxx#22 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=502: oxxxxx#22 nsp=1 mass=400 + 8.320502943378436e+00, -8.875203139603666e+00 } ); // itest=502: oxxxxx#22 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=503: oxxxxx#22 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=503: oxxxxx#22 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=503: oxxxxx#22 nsp=1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=503: oxxxxx#22 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=503: oxxxxx#22 nsp=1 mass=-400 + -8.320502943378436e+00, 8.875203139603666e+00 } ); // itest=503: oxxxxx#22 nsp=1 mass=-400 + 
expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=504: ixxxxx#23 nsp=1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=505: ixxxxx#23 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=506: vxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=506: vxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=506: vxxxxx#23 nsp=1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=506: vxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=506: vxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=506: vxxxxx#23 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=507: vxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=507: vxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=507: vxxxxx#23 nsp=1 mass=-500 + 
-7.071067811865476e-01, 0.000000000000000e+00, // itest=507: vxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=507: vxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=507: vxxxxx#23 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=508: sxxxxx#23 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=509: sxxxxx#23 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=510: oxxxxx#23 nsp=1 mass=500 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=511: oxxxxx#23 nsp=1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=512: ixxxxx#24 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=512: ixxxxx#24 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=512: ixxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=512: ixxxxx#24 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=512: ixxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=512: ixxxxx#24 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=513: ixxxxx#24 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=513: ixxxxx#24 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=513: ixxxxx#24 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=513: ixxxxx#24 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=513: ixxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=513: ixxxxx#24 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=514: vxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=514: vxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=514: vxxxxx#24 nsp=1 mass=400 + -7.071067811865476e-01, 
0.000000000000000e+00, // itest=514: vxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=514: vxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=514: vxxxxx#24 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=515: vxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=515: vxxxxx#24 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=515: vxxxxx#24 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=515: vxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=515: vxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=515: vxxxxx#24 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=516: sxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=516: sxxxxx#24 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=516: sxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=516: sxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=516: sxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=516: sxxxxx#24 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=517: sxxxxx#24 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 
3.000000000000000e+02, // itest=518: oxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=518: oxxxxx#24 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=518: oxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=518: oxxxxx#24 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=518: oxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=518: oxxxxx#24 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=519: oxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=519: oxxxxx#24 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=519: oxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=519: oxxxxx#24 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=519: oxxxxx#24 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=519: oxxxxx#24 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=520: ixxxxx#25 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=520: ixxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=520: ixxxxx#25 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00, // itest=520: ixxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=520: ixxxxx#25 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=520: ixxxxx#25 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=521: ixxxxx#25 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=521: ixxxxx#25 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=521: ixxxxx#25 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00, // 
itest=521: ixxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=521: ixxxxx#25 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=521: ixxxxx#25 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=522: vxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=522: vxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=522: vxxxxx#25 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=522: vxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=522: vxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=522: vxxxxx#25 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=523: vxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=523: vxxxxx#25 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=523: vxxxxx#25 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=523: vxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=523: vxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=523: vxxxxx#25 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=524: sxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=524: sxxxxx#25 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=524: sxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=524: sxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=524: sxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=524: sxxxxx#25 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, 
// itest=525: sxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=525: sxxxxx#25 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=525: sxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=525: sxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=525: sxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=525: sxxxxx#25 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=526: oxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=526: oxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=526: oxxxxx#25 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=526: oxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=526: oxxxxx#25 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=526: oxxxxx#25 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=527: oxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=527: oxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=527: oxxxxx#25 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=527: oxxxxx#25 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=527: oxxxxx#25 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=527: oxxxxx#25 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=528: ixxxxx#26 nsp=1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=528: ixxxxx#26 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=528: ixxxxx#26 nsp=1 mass=400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=528: ixxxxx#26 nsp=1 
mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=528: ixxxxx#26 nsp=1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=528: ixxxxx#26 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=529: ixxxxx#26 nsp=1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=529: ixxxxx#26 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=529: ixxxxx#26 nsp=1 mass=-400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=529: ixxxxx#26 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=529: ixxxxx#26 nsp=1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=529: ixxxxx#26 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=530: vxxxxx#26 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=530: vxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=530: vxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=530: vxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=530: vxxxxx#26 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=530: vxxxxx#26 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=531: vxxxxx#26 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=531: vxxxxx#26 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=531: vxxxxx#26 nsp=1 mass=-400 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=531: vxxxxx#26 nsp=1 mass=-400 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=531: vxxxxx#26 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=531: vxxxxx#26 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=532: 
sxxxxx#26 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=532: sxxxxx#26 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=532: sxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=532: sxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=532: sxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=532: sxxxxx#26 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=533: sxxxxx#26 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=534: oxxxxx#26 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=534: oxxxxx#26 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=534: oxxxxx#26 nsp=1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=534: oxxxxx#26 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=534: oxxxxx#26 nsp=1 mass=400 + 5.999999999999999e+00, -7.999999999999999e+00 } ); // itest=534: oxxxxx#26 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=535: oxxxxx#26 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=535: oxxxxx#26 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=535: oxxxxx#26 nsp=1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=535: oxxxxx#26 nsp=1 mass=-400 + 
-1.000000000000000e+01, -0.000000000000000e+00, // itest=535: oxxxxx#26 nsp=1 mass=-400 + -5.999999999999999e+00, 7.999999999999999e+00 } ); // itest=535: oxxxxx#26 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=536: ixxxxx#27 nsp=1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=536: ixxxxx#27 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=536: ixxxxx#27 nsp=1 mass=400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=536: ixxxxx#27 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=536: ixxxxx#27 nsp=1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=536: ixxxxx#27 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=537: ixxxxx#27 nsp=1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=537: ixxxxx#27 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=537: ixxxxx#27 nsp=1 mass=-400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=537: ixxxxx#27 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=537: ixxxxx#27 nsp=1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=537: ixxxxx#27 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=538: vxxxxx#27 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=538: vxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=538: vxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=538: vxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=538: vxxxxx#27 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=538: vxxxxx#27 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=539: vxxxxx#27 nsp=1 
mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=539: vxxxxx#27 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=539: vxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=539: vxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=539: vxxxxx#27 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=539: vxxxxx#27 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=540: sxxxxx#27 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=540: sxxxxx#27 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=540: sxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=540: sxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=540: sxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=540: sxxxxx#27 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=541: sxxxxx#27 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=541: sxxxxx#27 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=541: sxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=541: sxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=541: sxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=541: sxxxxx#27 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=542: oxxxxx#27 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=542: oxxxxx#27 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=542: oxxxxx#27 nsp=1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=542: oxxxxx#27 nsp=1 mass=400 + 
1.000000000000000e+01, 0.000000000000000e+00, // itest=542: oxxxxx#27 nsp=1 mass=400 + -7.999999999999999e+00, 5.999999999999999e+00 } ); // itest=542: oxxxxx#27 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=543: oxxxxx#27 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=543: oxxxxx#27 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=543: oxxxxx#27 nsp=1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=543: oxxxxx#27 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=543: oxxxxx#27 nsp=1 mass=-400 + 7.999999999999999e+00, -5.999999999999999e+00 } ); // itest=543: oxxxxx#27 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=544: ixxxxx#28 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=544: ixxxxx#28 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=544: ixxxxx#28 nsp=1 mass=400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=544: ixxxxx#28 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=544: ixxxxx#28 nsp=1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=544: ixxxxx#28 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=545: ixxxxx#28 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=545: ixxxxx#28 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=545: ixxxxx#28 nsp=1 mass=-400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=545: ixxxxx#28 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=545: ixxxxx#28 nsp=1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=545: ixxxxx#28 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=546: vxxxxx#28 
nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=546: vxxxxx#28 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=546: vxxxxx#28 nsp=1 mass=400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=546: vxxxxx#28 nsp=1 mass=400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=546: vxxxxx#28 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=546: vxxxxx#28 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=547: vxxxxx#28 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=547: vxxxxx#28 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=547: vxxxxx#28 nsp=1 mass=-400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=547: vxxxxx#28 nsp=1 mass=-400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=547: vxxxxx#28 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=547: vxxxxx#28 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=548: sxxxxx#28 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=548: sxxxxx#28 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=548: sxxxxx#28 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=548: sxxxxx#28 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=548: sxxxxx#28 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=548: sxxxxx#28 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=549: sxxxxx#28 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=549: sxxxxx#28 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=549: sxxxxx#28 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=549: sxxxxx#28 nsp=1 mass=-400 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=549: sxxxxx#28 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=549: sxxxxx#28 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=550: oxxxxx#28 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=550: oxxxxx#28 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=550: oxxxxx#28 nsp=1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=550: oxxxxx#28 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=550: oxxxxx#28 nsp=1 mass=400 + 4.931969619160719e+00, -5.260767593771432e+00 } ); // itest=550: oxxxxx#28 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=551: oxxxxx#28 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=551: oxxxxx#28 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=551: oxxxxx#28 nsp=1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=551: oxxxxx#28 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=551: oxxxxx#28 nsp=1 mass=-400 + -4.931969619160719e+00, 5.260767593771432e+00 } ); // itest=551: oxxxxx#28 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=552: ixxxxx#29 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=552: ixxxxx#29 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=552: ixxxxx#29 nsp=1 mass=400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=552: ixxxxx#29 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=552: ixxxxx#29 nsp=1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=552: ixxxxx#29 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=553: ixxxxx#29 nsp=1 mass=-400 
+ -1.800000000000000e+02, -1.920000000000000e+02, // itest=553: ixxxxx#29 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=553: ixxxxx#29 nsp=1 mass=-400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=553: ixxxxx#29 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=553: ixxxxx#29 nsp=1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=553: ixxxxx#29 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=554: vxxxxx#29 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=554: vxxxxx#29 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=554: vxxxxx#29 nsp=1 mass=400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=554: vxxxxx#29 nsp=1 mass=400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=554: vxxxxx#29 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=554: vxxxxx#29 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=555: vxxxxx#29 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=555: vxxxxx#29 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=555: vxxxxx#29 nsp=1 mass=-400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=555: vxxxxx#29 nsp=1 mass=-400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=555: vxxxxx#29 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=555: vxxxxx#29 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=556: sxxxxx#29 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=556: sxxxxx#29 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=556: sxxxxx#29 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=556: sxxxxx#29 nsp=1 mass=400 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=556: sxxxxx#29 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=556: sxxxxx#29 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=557: sxxxxx#29 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=558: oxxxxx#29 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=558: oxxxxx#29 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=558: oxxxxx#29 nsp=1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=558: oxxxxx#29 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=558: oxxxxx#29 nsp=1 mass=400 + 8.320502943378436e+00, -8.875203139603666e+00 } ); // itest=558: oxxxxx#29 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=559: oxxxxx#29 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=559: oxxxxx#29 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=559: oxxxxx#29 nsp=1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=559: oxxxxx#29 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=559: oxxxxx#29 nsp=1 mass=-400 + -8.320502943378436e+00, 8.875203139603666e+00 } ); // itest=559: oxxxxx#29 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + 
-0.000000000000000e+00, -0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=560: ixxxxx#30 nsp=1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=561: ixxxxx#30 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=562: vxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=562: vxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=562: vxxxxx#30 nsp=1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=562: vxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=562: vxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=562: vxxxxx#30 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=563: vxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=563: vxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=563: vxxxxx#30 nsp=1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=563: vxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 
7.071067811865476e-01, // itest=563: vxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=563: vxxxxx#30 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=564: sxxxxx#30 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=565: sxxxxx#30 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=566: oxxxxx#30 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=567: oxxxxx#30 nsp=1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=568: ixxxxx#31 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=568: ixxxxx#31 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=568: ixxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=568: ixxxxx#31 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=568: ixxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=568: ixxxxx#31 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=569: ixxxxx#31 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=569: ixxxxx#31 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=569: ixxxxx#31 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=569: ixxxxx#31 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=569: ixxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=569: ixxxxx#31 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=570: vxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=570: vxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=570: vxxxxx#31 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=570: vxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // 
itest=570: vxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=570: vxxxxx#31 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=571: vxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=571: vxxxxx#31 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=571: vxxxxx#31 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=571: vxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=571: vxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=571: vxxxxx#31 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=572: sxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=572: sxxxxx#31 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=572: sxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=572: sxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=572: sxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=572: sxxxxx#31 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=573: sxxxxx#31 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=574: oxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=574: oxxxxx#31 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=574: oxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=574: oxxxxx#31 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=574: oxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=574: oxxxxx#31 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=575: oxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=575: oxxxxx#31 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=575: oxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=575: oxxxxx#31 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=575: oxxxxx#31 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=575: oxxxxx#31 nsp=1 mass=-400 diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/timer.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/timer.h new file mode 100644 index 0000000000..0f2712facf --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/timer.h @@ -0,0 +1,72 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUTIMER_H +#define MGONGPUTIMER_H 1 + +#include +#include + +namespace mgOnGpu +{ + + /* + high_resolution_clock + steady_clock + system_clock + + from https://www.modernescpp.com/index.php/the-three-clocks + and https://codereview.stackexchange.com/questions/196245/extremely-simple-timer-class-in-c + */ + + template + class Timer + { + public: + Timer() + : m_StartTime( T::now() ) {} + virtual ~Timer() {} + void Start(); + float GetDuration(); + void Info(); + private: + typedef typename T::time_point TTP; + TTP m_StartTime; + }; + + template + void + Timer::Start() + { + m_StartTime = T::now(); + } + + template + float + Timer::GetDuration() + { + std::chrono::duration duration = T::now() - m_StartTime; + return duration.count(); + } + + template + void + Timer::Info() + { + typedef typename T::period TPER; + typedef typename std::ratio_multiply MilliSec; + typedef typename std::ratio_multiply MicroSec; + std::cout << std::boolalpha << std::endl; + std::cout << "clock info: " << std::endl; + std::cout << " is steady: " << T::is_steady << std::endl; + std::cout << " precision: " << TPER::num << "/" << TPER::den << " second " << std::endl; + std::cout << std::fixed; + std::cout << " " << static_cast( MilliSec::num ) / MilliSec::den << " milliseconds " << std::endl; + std::cout << " " << static_cast( MicroSec::num ) / MicroSec::den << " microseconds " << std::endl; + std::cout << std::endl; + } + +} +#endif // MGONGPUTIMER_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/timermap.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/timermap.h new file mode 100644 index 0000000000..90468bd768 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/timermap.h @@ -0,0 +1,161 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. 
Mattelaer, S. Roiser, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUTIMERMAP_H +#define MGONGPUTIMERMAP_H 1 + +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include "nvtx.h" +#pragma GCC diagnostic pop + +#include "timer.h" +#define TIMERTYPE std::chrono::high_resolution_clock + +namespace mgOnGpu +{ + class TimerMap + { + + public: + + TimerMap() + : m_timer(), m_active( "" ), m_partitionTimers(), m_partitionIds() {} + virtual ~TimerMap() {} + + // Start the timer for a specific partition (key must be a non-empty string) + // Stop the timer for the current partition if there is one active + float start( const std::string& key ) + { + assert( key != "" ); + // Close the previously active partition + float last = stop(); + // Switch to a new partition + m_timer.Start(); + m_active = key; + if( m_partitionTimers.find( key ) == m_partitionTimers.end() ) + { + m_partitionIds[key] = m_partitionTimers.size(); + m_partitionTimers[key] = 0; + } + // Open a new Cuda NVTX range + NVTX_PUSH( key.c_str(), m_partitionIds[key] ); + // Return last duration + return last; + } + + // Stop the timer for the current partition if there is one active + float stop() + { + // Close the previously active partition + float last = 0; + if( m_active != "" ) + { + last = m_timer.GetDuration(); + m_partitionTimers[m_active] += last; + } + m_active = ""; + // Close the current Cuda NVTX range + NVTX_POP(); + // Return last duration + return last; + } + + // Dump the overall results + void dump( std::ostream& ostr = std::cout, bool json = false ) + { + // Improve key formatting + const std::string totalKey = "TOTAL "; // "TOTAL (ANY)"? 
+ //const std::string totalBut2Key = "TOTAL (n-2)"; + const std::string total123Key = "TOTAL (123)"; + const std::string total23Key = "TOTAL (23)"; + const std::string total1Key = "TOTAL (1)"; + const std::string total2Key = "TOTAL (2)"; + const std::string total3Key = "TOTAL (3)"; + const std::string total3aKey = "TOTAL (3a)"; + size_t maxsize = 0; + for( auto ip: m_partitionTimers ) + maxsize = std::max( maxsize, ip.first.size() ); + maxsize = std::max( maxsize, totalKey.size() ); + // Compute the overall total + //size_t ipart = 0; + float total = 0; + //float totalBut2 = 0; + float total123 = 0; + float total23 = 0; + float total1 = 0; + float total2 = 0; + float total3 = 0; + float total3a = 0; + for( auto ip: m_partitionTimers ) + { + total += ip.second; + //if ( ipart != 0 && ipart+1 != m_partitionTimers.size() ) totalBut2 += ip.second; + if( ip.first[0] == '1' || ip.first[0] == '2' || ip.first[0] == '3' ) total123 += ip.second; + if( ip.first[0] == '2' || ip.first[0] == '3' ) total23 += ip.second; + if( ip.first[0] == '1' ) total1 += ip.second; + if( ip.first[0] == '2' ) total2 += ip.second; + if( ip.first[0] == '3' ) total3 += ip.second; + if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; + //ipart++; + } + // Dump individual partition timers and the overall total + if( json ) + { + std::string s1 = "\"", s2 = "\" : \"", s3 = " sec\","; + ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats + ostr << std::fixed; // fixed format: affects all floats + for( auto ip: m_partitionTimers ) + ostr << s1 << ip.first << s2 << ip.second << s3 << std::endl; + ostr << s1 << totalKey << s2 << total << s3 << std::endl + << s1 << total123Key << s2 << total123 << s3 << std::endl + << s1 << total23Key << s2 << total23 << s3 << std::endl + << s1 << total3Key << s2 << total3 << s3 << std::endl + << s1 << total3aKey << s2 << total3a << " sec \"" << std::endl; + ostr << std::defaultfloat; // default format: affects all floats + } + 
else + { + // NB: 'setw' affects only the next field (of any type) + ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats + ostr << std::fixed; // fixed format: affects all floats + for( auto ip: m_partitionTimers ) + ostr << std::setw( maxsize ) << ip.first << " : " + << std::setw( 12 ) << ip.second << " sec" << std::endl; + ostr << std::setw( maxsize ) << totalKey << " : " + << std::setw( 12 ) << total << " sec" << std::endl + << std::setw( maxsize ) << total123Key << " : " + << std::setw( 12 ) << total123 << " sec" << std::endl + << std::setw( maxsize ) << total23Key << " : " + << std::setw( 12 ) << total23 << " sec" << std::endl + << std::setw( maxsize ) << total1Key << " : " + << std::setw( 12 ) << total1 << " sec" << std::endl + << std::setw( maxsize ) << total2Key << " : " + << std::setw( 12 ) << total2 << " sec" << std::endl + << std::setw( maxsize ) << total3Key << " : " + << std::setw( 12 ) << total3 << " sec" << std::endl + << std::setw( maxsize ) << total3aKey << " : " + << std::setw( 12 ) << total3a << " sec" << std::endl; + ostr << std::defaultfloat; // default format: affects all floats + } + } + + private: + + Timer m_timer; + std::string m_active; + std::map m_partitionTimers; + std::map m_partitionIds; + }; + +} + +#endif // MGONGPUTIMERMAP_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc new file mode 100644 index 0000000000..d83768a43d --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef 
MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= 
count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; 
+#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast( 
input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* 
diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + + copy_inputs<<>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* 
instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_dependentCouplings::ndcoup * 2 ); + HostBufferBase g_s( rounded_count ); + HostBufferBase helicity_random( rounded_count ); + HostBufferBase color_random( 
rounded_count ); + HostBufferBase diagram_random( rounded_count ); + HostBufferBase matrix_elements( rounded_count ); + HostBufferBase diagram_index( rounded_count ); + HostBufferBase numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase denominators( rounded_count ); + HostBufferBase helicity_index( rounded_count ); + HostBufferBase color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; 
++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param meta_key + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. 
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. 
The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/valgrind.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/valgrind.h new file mode 100644 index 0000000000..5e610e59ba --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/valgrind.h @@ -0,0 +1,7170 @@ +/* clang-format off */ +/* -*- c -*- + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (valgrind.h) only. The rest of Valgrind is licensed under the + terms of the GNU General Public License, version 2, unless + otherwise indicated. See the COPYING file in the source + distribution for details. + + ---------------------------------------------------------------- + + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2000-2017 Julian Seward. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (valgrind.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 2. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + + +/* This file is for inclusion into client (your!) 
code. + + You can use these macros to manipulate and query Valgrind's + execution inside your own programs. + + The resulting executables will still run without Valgrind, just a + little bit more slowly than they otherwise would, but otherwise + unchanged. When not running on valgrind, each client request + consumes very few (eg. 7) instructions, so the resulting performance + loss is negligible unless you plan to execute client requests + millions of times per second. Nevertheless, if that is still a + problem, you can compile with the NVALGRIND symbol defined (gcc + -DNVALGRIND) so that client requests are not even compiled in. */ + +#ifndef __VALGRIND_H +#define __VALGRIND_H + + +/* ------------------------------------------------------------------ */ +/* VERSION NUMBER OF VALGRIND */ +/* ------------------------------------------------------------------ */ + +/* Specify Valgrind's version number, so that user code can + conditionally compile based on our version number. Note that these + were introduced at version 3.6 and so do not exist in version 3.5 + or earlier. The recommended way to use them to check for "version + X.Y or later" is (eg) + +#if defined(__VALGRIND_MAJOR__) && defined(__VALGRIND_MINOR__) \ + && (__VALGRIND_MAJOR__ > 3 \ + || (__VALGRIND_MAJOR__ == 3 && __VALGRIND_MINOR__ >= 6)) +*/ +#define __VALGRIND_MAJOR__ 3 +#define __VALGRIND_MINOR__ 23 + + +#include + +/* Nb: this file might be included in a file compiled with -ansi. So + we can't use C++ style "//" comments nor the "asm" keyword (instead + use "__asm__"). */ + +/* Derive some tags indicating what the target platform is. Note + that in this file we're using the compiler's CPP symbols for + identifying architectures, which are different to the ones we use + within the rest of Valgrind. Note, __powerpc__ is active for both + 32 and 64-bit PPC, whereas __powerpc64__ is only active for the + latter (on Linux, that is). 
+ + Misc note: how to find out what's predefined in gcc by default: + gcc -Wp,-dM somefile.c +*/ +#undef PLAT_x86_darwin +#undef PLAT_amd64_darwin +#undef PLAT_x86_freebsd +#undef PLAT_amd64_freebsd +#undef PLAT_arm64_freebsd +#undef PLAT_x86_win32 +#undef PLAT_amd64_win64 +#undef PLAT_x86_linux +#undef PLAT_amd64_linux +#undef PLAT_ppc32_linux +#undef PLAT_ppc64be_linux +#undef PLAT_ppc64le_linux +#undef PLAT_arm_linux +#undef PLAT_arm64_linux +#undef PLAT_s390x_linux +#undef PLAT_mips32_linux +#undef PLAT_mips64_linux +#undef PLAT_nanomips_linux +#undef PLAT_x86_solaris +#undef PLAT_amd64_solaris + + +#if defined(__APPLE__) && defined(__i386__) +# define PLAT_x86_darwin 1 +#elif defined(__APPLE__) && defined(__x86_64__) +# define PLAT_amd64_darwin 1 +#elif defined(__FreeBSD__) && defined(__i386__) +# define PLAT_x86_freebsd 1 +#elif defined(__FreeBSD__) && defined(__amd64__) +# define PLAT_amd64_freebsd 1 +#elif defined(__FreeBSD__) && defined(__aarch64__) && !defined(__arm__) +# define PLAT_arm64_freebsd 1 +#elif (defined(__MINGW32__) && defined(__i386__)) \ + || defined(__CYGWIN32__) \ + || (defined(_WIN32) && defined(_M_IX86)) +# define PLAT_x86_win32 1 +#elif (defined(__MINGW32__) && defined(__x86_64__)) \ + || (defined(_WIN32) && defined(_M_X64)) +/* __MINGW32__ and _WIN32 are defined in 64 bit mode as well. 
*/ +# define PLAT_amd64_win64 1 +#elif defined(__linux__) && defined(__i386__) +# define PLAT_x86_linux 1 +#elif defined(__linux__) && defined(__x86_64__) && !defined(__ILP32__) +# define PLAT_amd64_linux 1 +#elif defined(__linux__) && defined(__powerpc__) && !defined(__powerpc64__) +# define PLAT_ppc32_linux 1 +#elif defined(__linux__) && defined(__powerpc__) && defined(__powerpc64__) && _CALL_ELF != 2 +/* Big Endian uses ELF version 1 */ +# define PLAT_ppc64be_linux 1 +#elif defined(__linux__) && defined(__powerpc__) && defined(__powerpc64__) && _CALL_ELF == 2 +/* Little Endian uses ELF version 2 */ +# define PLAT_ppc64le_linux 1 +#elif defined(__linux__) && defined(__arm__) && !defined(__aarch64__) +# define PLAT_arm_linux 1 +#elif defined(__linux__) && defined(__aarch64__) && !defined(__arm__) +# define PLAT_arm64_linux 1 +#elif defined(__linux__) && defined(__s390__) && defined(__s390x__) +# define PLAT_s390x_linux 1 +#elif defined(__linux__) && defined(__mips__) && (__mips==64) +# define PLAT_mips64_linux 1 +#elif defined(__linux__) && defined(__mips__) && (__mips==32) +# define PLAT_mips32_linux 1 +#elif defined(__linux__) && defined(__nanomips__) +# define PLAT_nanomips_linux 1 +#elif defined(__sun) && defined(__i386__) +# define PLAT_x86_solaris 1 +#elif defined(__sun) && defined(__x86_64__) +# define PLAT_amd64_solaris 1 +#else +/* If we're not compiling for our target platform, don't generate + any inline asms. */ +# if !defined(NVALGRIND) +# define NVALGRIND 1 +# endif +#endif + + +/* ------------------------------------------------------------------ */ +/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS. There is nothing */ +/* in here of use to end-users -- skip to the next section. */ +/* ------------------------------------------------------------------ */ + +/* + * VALGRIND_DO_CLIENT_REQUEST(): a statement that invokes a Valgrind client + * request. Accepts both pointers and integers as arguments. 
+ * + * VALGRIND_DO_CLIENT_REQUEST_STMT(): a statement that invokes a Valgrind + * client request that does not return a value. + + * VALGRIND_DO_CLIENT_REQUEST_EXPR(): a C expression that invokes a Valgrind + * client request and whose value equals the client request result. Accepts + * both pointers and integers as arguments. Note that such calls are not + * necessarily pure functions -- they may have side effects. + */ + +#define VALGRIND_DO_CLIENT_REQUEST(_zzq_rlval, _zzq_default, \ + _zzq_request, _zzq_arg1, _zzq_arg2, \ + _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + do { (_zzq_rlval) = VALGRIND_DO_CLIENT_REQUEST_EXPR((_zzq_default), \ + (_zzq_request), (_zzq_arg1), (_zzq_arg2), \ + (_zzq_arg3), (_zzq_arg4), (_zzq_arg5)); } while (0) + +#define VALGRIND_DO_CLIENT_REQUEST_STMT(_zzq_request, _zzq_arg1, \ + _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + do { (void) VALGRIND_DO_CLIENT_REQUEST_EXPR(0, \ + (_zzq_request), (_zzq_arg1), (_zzq_arg2), \ + (_zzq_arg3), (_zzq_arg4), (_zzq_arg5)); } while (0) + +#if defined(NVALGRIND) + +/* Define NVALGRIND to completely remove the Valgrind magic sequence + from the compiled code (analogous to NDEBUG's effects on + assert()) */ +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + (_zzq_default) + +#else /* ! NVALGRIND */ + +/* The following defines the magic code sequences which the JITter + spots and handles magically. Don't look too closely at them as + they will rot your brain. + + The assembly code sequences for all architectures is in this one + file. This is because this file must be stand-alone, and we don't + want to have multiple files. + + For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default + value gets put in the return slot, so that everything works when + this is executed not under Valgrind. Args are passed in a memory + block, and so there's no intrinsic limit to the number that could + be passed, but it's currently five. 
+ + The macro args are: + _zzq_rlval result lvalue + _zzq_default default value (result returned when running on real CPU) + _zzq_request request code + _zzq_arg1..5 request params + + The other two macros are used to support function wrapping, and are + a lot simpler. VALGRIND_GET_NR_CONTEXT returns the value of the + guest's NRADDR pseudo-register and whatever other information is + needed to safely run the call original from the wrapper: on + ppc64-linux, the R2 value at the divert point is also needed. This + information is abstracted into a user-visible type, OrigFn. + + VALGRIND_CALL_NOREDIR_* behaves the same as the following on the + guest, but guarantees that the branch instruction will not be + redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64: + branch-and-link-to-r11. VALGRIND_CALL_NOREDIR is just text, not a + complete inline asm, since it needs to be combined with more magic + inline asm stuff to be useful. +*/ + +/* ----------------- x86-{linux,darwin,solaris} ---------------- */ + +#if defined(PLAT_x86_linux) || defined(PLAT_x86_darwin) \ + || (defined(PLAT_x86_win32) && defined(__GNUC__)) \ + || defined(PLAT_x86_solaris) || defined(PLAT_x86_freebsd) + +typedef + struct { + unsigned int nraddr; /* where's the code? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "roll $3, %%edi ; roll $13, %%edi\n\t" \ + "roll $29, %%edi ; roll $19, %%edi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + __extension__ \ + ({volatile unsigned int _zzq_args[6]; \ + volatile unsigned int _zzq_result; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EDX = client_request ( %EAX ) */ \ + "xchgl %%ebx,%%ebx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EAX = guest_NRADDR */ \ + "xchgl %%ecx,%%ecx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_EAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%EAX */ \ + "xchgl %%edx,%%edx\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + "xchgl %%edi,%%edi\n\t" \ + : : : "cc", "memory" \ + ); \ + } while (0) + +#endif /* PLAT_x86_linux || PLAT_x86_darwin || (PLAT_x86_win32 && __GNUC__) + || PLAT_x86_solaris */ + +/* ------------------------- x86-Win32 ------------------------- */ + +#if defined(PLAT_x86_win32) && !defined(__GNUC__) + +typedef + struct { + unsigned int nraddr; /* where's the code? 
*/ + } + OrigFn; + +#if defined(_MSC_VER) + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + __asm rol edi, 3 __asm rol edi, 13 \ + __asm rol edi, 29 __asm rol edi, 19 + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + valgrind_do_client_request_expr((uintptr_t)(_zzq_default), \ + (uintptr_t)(_zzq_request), (uintptr_t)(_zzq_arg1), \ + (uintptr_t)(_zzq_arg2), (uintptr_t)(_zzq_arg3), \ + (uintptr_t)(_zzq_arg4), (uintptr_t)(_zzq_arg5)) + +static __inline uintptr_t +valgrind_do_client_request_expr(uintptr_t _zzq_default, uintptr_t _zzq_request, + uintptr_t _zzq_arg1, uintptr_t _zzq_arg2, + uintptr_t _zzq_arg3, uintptr_t _zzq_arg4, + uintptr_t _zzq_arg5) +{ + volatile uintptr_t _zzq_args[6]; + volatile unsigned int _zzq_result; + _zzq_args[0] = (uintptr_t)(_zzq_request); + _zzq_args[1] = (uintptr_t)(_zzq_arg1); + _zzq_args[2] = (uintptr_t)(_zzq_arg2); + _zzq_args[3] = (uintptr_t)(_zzq_arg3); + _zzq_args[4] = (uintptr_t)(_zzq_arg4); + _zzq_args[5] = (uintptr_t)(_zzq_arg5); + __asm { __asm lea eax, _zzq_args __asm mov edx, _zzq_default + __SPECIAL_INSTRUCTION_PREAMBLE + /* %EDX = client_request ( %EAX ) */ + __asm xchg ebx,ebx + __asm mov _zzq_result, edx + } + return _zzq_result; +} + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned int __addr; \ + __asm { __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EAX = guest_NRADDR */ \ + __asm xchg ecx,ecx \ + __asm mov __addr, eax \ + } \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_EAX ERROR + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm { __SPECIAL_INSTRUCTION_PREAMBLE \ + __asm xchg edi,edi \ + } \ + } while (0) + +#else +#error Unsupported compiler. 
+#endif + +#endif /* PLAT_x86_win32 */ + +/* ----------------- amd64-{linux,darwin,solaris} --------------- */ + +#if defined(PLAT_amd64_linux) || defined(PLAT_amd64_darwin) \ + || defined(PLAT_amd64_solaris) \ + || defined(PLAT_amd64_freebsd) \ + || (defined(PLAT_amd64_win64) && defined(__GNUC__)) + +typedef + struct { + unsigned long int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rolq $3, %%rdi ; rolq $13, %%rdi\n\t" \ + "rolq $61, %%rdi ; rolq $51, %%rdi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + __extension__ \ + ({ volatile unsigned long int _zzq_args[6]; \ + volatile unsigned long int _zzq_result; \ + _zzq_args[0] = (unsigned long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RDX = client_request ( %RAX ) */ \ + "xchgq %%rbx,%%rbx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RAX = guest_NRADDR */ \ + "xchgq %%rcx,%%rcx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_RAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%RAX */ \ + "xchgq %%rdx,%%rdx\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + "xchgq %%rdi,%%rdi\n\t" \ + : : : "cc", "memory" \ + ); \ + } while (0) + +#endif /* PLAT_amd64_linux || PLAT_amd64_darwin 
|| PLAT_amd64_solaris */ + +/* ------------------------- amd64-Win64 ------------------------- */ + +#if defined(PLAT_amd64_win64) && !defined(__GNUC__) + +#error Unsupported compiler. + +#endif /* PLAT_amd64_win64 */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rlwinm 0,0,3,0,31 ; rlwinm 0,0,13,0,31\n\t" \ + "rlwinm 0,0,29,0,31 ; rlwinm 0,0,19,0,31\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + __extension__ \ + ({ unsigned int _zzq_args[6]; \ + unsigned int _zzq_result; \ + unsigned int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 3,%1\n\t" /*default*/ \ + "mr 4,%2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" /*result*/ \ + : "=b" (_zzq_result) \ + : "b" (_zzq_default), "b" (_zzq_ptr) \ + : "cc", "memory", "r3", "r4"); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + 
"or 5,5,5\n\t" \ + ); \ + } while (0) + +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64be_linux) + +typedef + struct { + unsigned long int nraddr; /* where's the code? */ + unsigned long int r2; /* what tocptr do we need? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ + "rotldi 0,0,61 ; rotldi 0,0,51\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + __extension__ \ + ({ unsigned long int _zzq_args[6]; \ + unsigned long int _zzq_result; \ + unsigned long int* _zzq_ptr; \ + _zzq_args[0] = (unsigned long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 3,%1\n\t" /*default*/ \ + "mr 4,%2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" /*result*/ \ + : "=b" (_zzq_result) \ + : "b" (_zzq_default), "b" (_zzq_ptr) \ + : "cc", "memory", "r3", "r4"); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* 
branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + "or 5,5,5\n\t" \ + ); \ + } while (0) + +#endif /* PLAT_ppc64be_linux */ + +#if defined(PLAT_ppc64le_linux) + +typedef + struct { + unsigned long int nraddr; /* where's the code? */ + unsigned long int r2; /* what tocptr do we need? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ + "rotldi 0,0,61 ; rotldi 0,0,51\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + __extension__ \ + ({ unsigned long int _zzq_args[6]; \ + unsigned long int _zzq_result; \ + unsigned long int* _zzq_ptr; \ + _zzq_args[0] = (unsigned long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 3,%1\n\t" /*default*/ \ + "mr 4,%2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" /*result*/ \ + : "=b" (_zzq_result) \ + : "b" (_zzq_default), "b" (_zzq_ptr) \ + : "cc", "memory", "r3", "r4"); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define 
VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R12 */ \ + "or 3,3,3\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + "or 5,5,5\n\t" \ + ); \ + } while (0) + +#endif /* PLAT_ppc64le_linux */ + +/* ------------------------- arm-linux ------------------------- */ + +#if defined(PLAT_arm_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "mov r12, r12, ror #3 ; mov r12, r12, ror #13 \n\t" \ + "mov r12, r12, ror #29 ; mov r12, r12, ror #19 \n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + __extension__ \ + ({volatile unsigned int _zzq_args[6]; \ + volatile unsigned int _zzq_result; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + __asm__ volatile("mov r3, %1\n\t" /*default*/ \ + "mov r4, %2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* R3 = client_request ( R4 ) */ \ + "orr r10, r10, r10\n\t" \ + "mov %0, r3" /*result*/ \ + : "=r" (_zzq_result) \ + : "r" (_zzq_default), "r" (&_zzq_args[0]) \ + : "cc","memory", "r3", "r4"); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* R3 = guest_NRADDR */ \ + "orr r11, r11, r11\n\t" \ + "mov %0, r3" \ + : "=r" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R4 */ \ + "orr r12, r12, 
r12\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + "orr r9, r9, r9\n\t" \ + : : : "cc", "memory" \ + ); \ + } while (0) + +#endif /* PLAT_arm_linux */ + +/* ------------------------ arm64-{linux,freebsd} ------------------------- */ + +#if defined(PLAT_arm64_linux) || defined(PLAT_arm64_freebsd) + +typedef + struct { + unsigned long int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "ror x12, x12, #3 ; ror x12, x12, #13 \n\t" \ + "ror x12, x12, #51 ; ror x12, x12, #61 \n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + __extension__ \ + ({volatile unsigned long int _zzq_args[6]; \ + volatile unsigned long int _zzq_result; \ + _zzq_args[0] = (unsigned long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long int)(_zzq_arg5); \ + __asm__ volatile("mov x3, %1\n\t" /*default*/ \ + "mov x4, %2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* X3 = client_request ( X4 ) */ \ + "orr x10, x10, x10\n\t" \ + "mov %0, x3" /*result*/ \ + : "=r" (_zzq_result) \ + : "r" ((unsigned long int)(_zzq_default)), \ + "r" (&_zzq_args[0]) \ + : "cc","memory", "x3", "x4"); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* X3 = guest_NRADDR */ \ + "orr x11, x11, x11\n\t" \ + "mov %0, x3" \ + : "=r" (__addr) \ + : \ + : "cc", "memory", "x3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir X8 */ \ + "orr x12, x12, 
x12\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + "orr x9, x9, x9\n\t" \ + : : : "cc", "memory" \ + ); \ + } while (0) + +#endif /* PLAT_arm64_linux || PLAT_arm64_freebsd */ + +/* ------------------------ s390x-linux ------------------------ */ + +#if defined(PLAT_s390x_linux) + +typedef + struct { + unsigned long int nraddr; /* where's the code? */ + } + OrigFn; + +/* __SPECIAL_INSTRUCTION_PREAMBLE will be used to identify Valgrind specific + * code. This detection is implemented in platform specific toIR.c + * (e.g. VEX/priv/guest_s390_decoder.c). + */ +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "lr 15,15\n\t" \ + "lr 1,1\n\t" \ + "lr 2,2\n\t" \ + "lr 3,3\n\t" + +#define __CLIENT_REQUEST_CODE "lr 2,2\n\t" +#define __GET_NR_CONTEXT_CODE "lr 3,3\n\t" +#define __CALL_NO_REDIR_CODE "lr 4,4\n\t" +#define __VEX_INJECT_IR_CODE "lr 5,5\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + __extension__ \ + ({volatile unsigned long int _zzq_args[6]; \ + volatile unsigned long int _zzq_result; \ + _zzq_args[0] = (unsigned long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long int)(_zzq_arg5); \ + __asm__ volatile(/* r2 = args */ \ + "lgr 2,%1\n\t" \ + /* r3 = default */ \ + "lgr 3,%2\n\t" \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + __CLIENT_REQUEST_CODE \ + /* results = r3 */ \ + "lgr %0, 3\n\t" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), \ + "0" ((unsigned long int)_zzq_default) \ + : "cc", "2", "3", "memory" \ + ); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned long int __addr; \ + __asm__ 
volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + __GET_NR_CONTEXT_CODE \ + "lgr %0, 3\n\t" \ + : "=a" (__addr) \ + : \ + : "cc", "3", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_R1 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + __CALL_NO_REDIR_CODE + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + __VEX_INJECT_IR_CODE); \ + } while (0) + +#endif /* PLAT_s390x_linux */ + +/* ------------------------- mips32-linux ---------------- */ + +#if defined(PLAT_mips32_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; + +/* .word 0x342 + * .word 0x742 + * .word 0xC2 + * .word 0x4C2*/ +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "srl $0, $0, 13\n\t" \ + "srl $0, $0, 29\n\t" \ + "srl $0, $0, 3\n\t" \ + "srl $0, $0, 19\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + __extension__ \ + ({ volatile unsigned int _zzq_args[6]; \ + volatile unsigned int _zzq_result; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + __asm__ volatile("move $11, %1\n\t" /*default*/ \ + "move $12, %2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* T3 = client_request ( T4 ) */ \ + "or $13, $13, $13\n\t" \ + "move %0, $11\n\t" /*result*/ \ + : "=r" (_zzq_result) \ + : "r" (_zzq_default), "r" (&_zzq_args[0]) \ + : "$11", "$12", "memory"); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %t9 = guest_NRADDR */ \ + "or $14, $14, $14\n\t" \ + "move %0, $11" /*result*/ \ + : "=r" (__addr) \ + : \ + : 
"$11" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_T9 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%t9 */ \ + "or $15, $15, $15\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + "or $11, $11, $11\n\t" \ + ); \ + } while (0) + + +#endif /* PLAT_mips32_linux */ + +/* ------------------------- mips64-linux ---------------- */ + +#if defined(PLAT_mips64_linux) + +typedef + struct { + unsigned long nraddr; /* where's the code? */ + } + OrigFn; + +/* dsll $0,$0, 3 + * dsll $0,$0, 13 + * dsll $0,$0, 29 + * dsll $0,$0, 19*/ +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "dsll $0,$0, 3 ; dsll $0,$0,13\n\t" \ + "dsll $0,$0,29 ; dsll $0,$0,19\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + __extension__ \ + ({ volatile unsigned long int _zzq_args[6]; \ + volatile unsigned long int _zzq_result; \ + _zzq_args[0] = (unsigned long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long int)(_zzq_arg5); \ + __asm__ volatile("move $11, %1\n\t" /*default*/ \ + "move $12, %2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* $11 = client_request ( $12 ) */ \ + "or $13, $13, $13\n\t" \ + "move %0, $11\n\t" /*result*/ \ + : "=r" (_zzq_result) \ + : "r" (_zzq_default), "r" (&_zzq_args[0]) \ + : "$11", "$12", "memory"); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* $11 = guest_NRADDR */ \ + "or $14, $14, $14\n\t" \ + "move %0, $11" /*result*/ \ + : "=r" (__addr) \ + : \ + : "$11"); \ + _zzq_orig->nraddr = __addr; \ + } + 
+#define VALGRIND_CALL_NOREDIR_T9 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir $25 */ \ + "or $15, $15, $15\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + "or $11, $11, $11\n\t" \ + ); \ + } while (0) + +#endif /* PLAT_mips64_linux */ + +#if defined(PLAT_nanomips_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; +/* + 8000 c04d srl zero, zero, 13 + 8000 c05d srl zero, zero, 29 + 8000 c043 srl zero, zero, 3 + 8000 c053 srl zero, zero, 19 +*/ + +#define __SPECIAL_INSTRUCTION_PREAMBLE "srl[32] $zero, $zero, 13 \n\t" \ + "srl[32] $zero, $zero, 29 \n\t" \ + "srl[32] $zero, $zero, 3 \n\t" \ + "srl[32] $zero, $zero, 19 \n\t" + +#define VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + __extension__ \ + ({ volatile unsigned int _zzq_args[6]; \ + volatile unsigned int _zzq_result; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + __asm__ volatile("move $a7, %1\n\t" /* default */ \ + "move $t0, %2\n\t" /* ptr */ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* $a7 = client_request( $t0 ) */ \ + "or[32] $t0, $t0, $t0\n\t" \ + "move %0, $a7\n\t" /* result */ \ + : "=r" (_zzq_result) \ + : "r" (_zzq_default), "r" (&_zzq_args[0]) \ + : "$a7", "$t0", "memory"); \ + _zzq_result; \ + }) + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* $a7 = guest_NRADDR */ \ + "or[32] $t1, $t1, $t1\n\t" \ + "move %0, $a7" /*result*/ \ + : "=r" (__addr) \ + : \ + : "$a7"); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_T9 \ + 
__SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir $25 */ \ + "or[32] $t2, $t2, $t2\n\t" + +#define VALGRIND_VEX_INJECT_IR() \ + do { \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + "or[32] $t3, $t3, $t3\n\t" \ + ); \ + } while (0) + +#endif +/* Insert assembly code for other platforms here... */ + +#endif /* NVALGRIND */ + + +/* ------------------------------------------------------------------ */ +/* PLATFORM SPECIFICS for FUNCTION WRAPPING. This is all very */ +/* ugly. It's the least-worst tradeoff I can think of. */ +/* ------------------------------------------------------------------ */ + +/* This section defines magic (a.k.a appalling-hack) macros for doing + guaranteed-no-redirection macros, so as to get from function + wrappers to the functions they are wrapping. The whole point is to + construct standard call sequences, but to do the call itself with a + special no-redirect call pseudo-instruction that the JIT + understands and handles specially. This section is long and + repetitious, and I can't see a way to make it shorter. + + The naming scheme is as follows: + + CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc} + + 'W' stands for "word" and 'v' for "void". Hence there are + different macros for calling arity 0, 1, 2, 3, 4, etc, functions, + and for each, the possibility of returning a word-typed result, or + no result. +*/ + +/* Use these to write the name of your wrapper. NOTE: duplicates + VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h. NOTE also: inserts + the default behaviour equivalance class tag "0000" into the name. + See pub_tool_redir.h for details -- normally you don't need to + think about this, though. */ + +/* Use an extra level of macroisation so as to ensure the soname/fnname + args are fully macro-expanded before pasting them together. 
*/ +#define VG_CONCAT4(_aa,_bb,_cc,_dd) _aa##_bb##_cc##_dd + +#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname) \ + VG_CONCAT4(_vgw00000ZU_,soname,_,fnname) + +#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname) \ + VG_CONCAT4(_vgw00000ZZ_,soname,_,fnname) + +/* Use this macro from within a wrapper function to collect the + context (address and possibly other info) of the original function. + Once you have that you can then use it in one of the CALL_FN_ + macros. The type of the argument _lval is OrigFn. */ +#define VALGRIND_GET_ORIG_FN(_lval) VALGRIND_GET_NR_CONTEXT(_lval) + +/* Also provide end-user facilities for function replacement, rather + than wrapping. A replacement function differs from a wrapper in + that it has no way to get hold of the original function being + called, and hence no way to call onwards to it. In a replacement + function, VALGRIND_GET_ORIG_FN always returns zero. */ + +#define I_REPLACE_SONAME_FNNAME_ZU(soname,fnname) \ + VG_CONCAT4(_vgr00000ZU_,soname,_,fnname) + +#define I_REPLACE_SONAME_FNNAME_ZZ(soname,fnname) \ + VG_CONCAT4(_vgr00000ZZ_,soname,_,fnname) + +/* Derivatives of the main macros below, for calling functions + returning void. 
*/ + +#define CALL_FN_v_v(fnptr) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_v(_junk,fnptr); } while (0) + +#define CALL_FN_v_W(fnptr, arg1) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_W(_junk,fnptr,arg1); } while (0) + +#define CALL_FN_v_WW(fnptr, arg1,arg2) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0) + +#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0) + +#define CALL_FN_v_WWWW(fnptr, arg1,arg2,arg3,arg4) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WWWW(_junk,fnptr,arg1,arg2,arg3,arg4); } while (0) + +#define CALL_FN_v_5W(fnptr, arg1,arg2,arg3,arg4,arg5) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_5W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5); } while (0) + +#define CALL_FN_v_6W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_6W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6); } while (0) + +#define CALL_FN_v_7W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6,arg7) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_7W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6,arg7); } while (0) + +/* ----------------- x86-{linux,darwin,solaris} ---------------- */ + +#if defined(PLAT_x86_linux) || defined(PLAT_x86_darwin) \ + || defined(PLAT_x86_solaris) || defined(PLAT_x86_freebsd) + +/* These regs are trashed by the hidden call. No need to mention eax + as gcc can already see that, plus causes gcc to bomb. */ +#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx" + +/* Macros to save and align the stack before making a function + call and restore it afterwards as gcc may not keep the stack + pointer aligned if it doesn't realise calls are being made + to other functions. 
*/ + +#define VALGRIND_ALIGN_STACK \ + "movl %%esp,%%edi\n\t" \ + "andl $0xfffffff0,%%esp\n\t" +#define VALGRIND_RESTORE_STACK \ + "movl %%edi,%%esp\n\t" + +/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned + long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "subl $12, %%esp\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "subl $8, %%esp\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", 
__CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "subl $4, %%esp\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned 
long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "subl $12, %%esp\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "subl $8, %%esp\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + 
_argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "subl $4, %%esp\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); 
\ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "subl $12, %%esp\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "subl $8, %%esp\n\t" \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + 
VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "subl $4, %%esp\n\t" \ + "pushl 44(%%eax)\n\t" \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = 
(unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "pushl 48(%%eax)\n\t" \ + "pushl 44(%%eax)\n\t" \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "edi" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_x86_linux || PLAT_x86_darwin || PLAT_x86_solaris */ + +/* ---------------- amd64-{linux,darwin,solaris} --------------- */ + +#if defined(PLAT_amd64_linux) || defined(PLAT_amd64_darwin) \ + || defined(PLAT_amd64_solaris) || defined(PLAT_amd64_freebsd) + +/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi", \ + "rdi", "r8", "r9", "r10", "r11" + +/* This is all pretty complex. It's so as to make stack unwinding + work reliably. See bug 243270. The basic problem is the sub and + add of 128 of %rsp in all of the following macros. If gcc believes + the CFA is in %rsp, then unwinding may fail, because what's at the + CFA is not what gcc "expected" when it constructs the CFIs for the + places where the macros are instantiated. 
+ + But we can't just add a CFI annotation to increase the CFA offset + by 128, to match the sub of 128 from %rsp, because we don't know + whether gcc has chosen %rsp as the CFA at that point, or whether it + has chosen some other register (eg, %rbp). In the latter case, + adding a CFI annotation to change the CFA offset is simply wrong. + + So the solution is to get hold of the CFA using + __builtin_dwarf_cfa(), put it in a known register, and add a + CFI annotation to say what the register is. We choose %rbp for + this (perhaps perversely), because: + + (1) %rbp is already subject to unwinding. If a new register was + chosen then the unwinder would have to unwind it in all stack + traces, which is expensive, and + + (2) %rbp is already subject to precise exception updates in the + JIT. If a new register was chosen, we'd have to have precise + exceptions for it too, which reduces performance of the + generated code. + + However .. one extra complication. We can't just whack the result + of __builtin_dwarf_cfa() into %rbp and then add %rbp to the + list of trashed registers at the end of the inline assembly + fragments; gcc won't allow %rbp to appear in that list. Hence + instead we need to stash %rbp in %r15 for the duration of the asm, + and say that %r15 is trashed instead. gcc seems happy to go with + that. + + Oh .. and this all needs to be conditionalised so that it is + unchanged from before this commit, when compiled with older gccs + that don't support __builtin_dwarf_cfa. Furthermore, since + this header file is freestanding, it has to be independent of + config.h, and so the following conditionalisation cannot depend on + configure time checks. + + Although it's not clear from + 'defined(__GNUC__) && defined(__GCC_HAVE_DWARF2_CFI_ASM)', + this expression excludes Darwin. + .cfi directives in Darwin assembly appear to be completely + different and I haven't investigated how they work. 
+ + For even more entertainment value, note we have to use the + completely undocumented __builtin_dwarf_cfa(), which appears to + really compute the CFA, whereas __builtin_frame_address(0) claims + to but actually doesn't. See + https://bugs.kde.org/show_bug.cgi?id=243270#c47 +*/ +#if defined(__GNUC__) && defined(__GCC_HAVE_DWARF2_CFI_ASM) +# define __FRAME_POINTER \ + ,"r"(__builtin_dwarf_cfa()) +# define VALGRIND_CFI_PROLOGUE \ + "movq %%rbp, %%r15\n\t" \ + "movq %2, %%rbp\n\t" \ + ".cfi_remember_state\n\t" \ + ".cfi_def_cfa rbp, 0\n\t" +# define VALGRIND_CFI_EPILOGUE \ + "movq %%r15, %%rbp\n\t" \ + ".cfi_restore_state\n\t" +#else +# define __FRAME_POINTER +# define VALGRIND_CFI_PROLOGUE +# define VALGRIND_CFI_EPILOGUE +#endif + +/* Macros to save and align the stack before making a function + call and restore it afterwards as gcc may not keep the stack + pointer aligned if it doesn't realise calls are being made + to other functions. */ + +#define VALGRIND_ALIGN_STACK \ + "movq %%rsp,%%r14\n\t" \ + "andq $0xfffffffffffffff0,%%rsp\n\t" +#define VALGRIND_RESTORE_STACK \ + "movq %%r14,%%rsp\n\t" + +/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned + long) == 8. */ + +/* NB 9 Sept 07. There is a nasty kludge here in all these CALL_FN_ + macros. In order not to trash the stack redzone, we need to drop + %rsp by 128 before the hidden call, and restore afterwards. The + nastyness is that it is only by luck that the stack still appears + to be unwindable during the hidden call - since then the behaviour + of any routine using this macro does not match what the CFI data + says. Sigh. + + Why is this important? Imagine that a wrapper has a stack + allocated local, and passes to the hidden call, a pointer to it. + Because gcc does not know about the hidden call, it may allocate + that local in the redzone. Unfortunately the hidden call may then + trash it before it comes to use it. 
So we must step clear of the + redzone, for the duration of the hidden call, to make it safe. + + Probably the same problem afflicts the other redzone-style ABIs too + (ppc64-linux); but for those, the stack is + self describing (none of this CFI nonsense) so at least messing + with the stack pointer doesn't give a danger of non-unwindable + stack. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ 
+ VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + 
VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 
16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $136,%%rsp\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + 
_argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $136,%%rsp\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", 
__CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = 
(unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $136,%%rsp\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + VALGRIND_ALIGN_STACK \ + "subq $128,%%rsp\n\t" \ + "pushq 96(%%rax)\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), 
%%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + VALGRIND_RESTORE_STACK \ + VALGRIND_CFI_EPILOGUE \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r14", "r15" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_amd64_linux || PLAT_amd64_darwin || PLAT_amd64_solaris */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +/* This is useful for finding out about the on-stack stuff: + + extern int f9 ( int,int,int,int,int,int,int,int,int ); + extern int f10 ( int,int,int,int,int,int,int,int,int,int ); + extern int f11 ( int,int,int,int,int,int,int,int,int,int,int ); + extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int ); + + int g9 ( void ) { + return f9(11,22,33,44,55,66,77,88,99); + } + int g10 ( void ) { + return f10(11,22,33,44,55,66,77,88,99,110); + } + int g11 ( void ) { + return f11(11,22,33,44,55,66,77,88,99,110,121); + } + int g12 ( void ) { + return f12(11,22,33,44,55,66,77,88,99,110,121,132); + } +*/ + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Macros to save and align the stack before making a function + call and restore it afterwards as gcc may not keep the stack + pointer aligned if it doesn't realise calls are being made + to other functions. 
*/ + +#define VALGRIND_ALIGN_STACK \ + "mr 28,1\n\t" \ + "rlwinm 1,1,0,0,27\n\t" +#define VALGRIND_RESTORE_STACK \ + "mr 1,28\n\t" + +/* These CALL_FN_ macros assume that on ppc32-linux, + sizeof(unsigned long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : 
/*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned 
long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + 
_argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + 
arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 
3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" 
(&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + _argvec[12] = (unsigned long)arg12; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg12 */ \ + "lwz 3,48(11)\n\t" \ + "stw 3,20(1)\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + VALGRIND_RESTORE_STACK \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64be_linux) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden 
call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Macros to save and align the stack before making a function + call and restore it afterwards as gcc may not keep the stack + pointer aligned if it doesn't realise calls are being made + to other functions. */ + +#define VALGRIND_ALIGN_STACK \ + "mr 28,1\n\t" \ + "rldicr 1,1,0,59\n\t" +#define VALGRIND_RESTORE_STACK \ + "mr 1,28\n\t" + +/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned + long) == 8. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 
11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* 
arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = 
(unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ 
"r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned 
long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 
3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + 
"ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + 
"ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg12 */ \ + "ld 3,96(11)\n\t" \ + "std 3,136(1)\n\t" \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 
11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc64be_linux */ + +/* ------------------------- ppc64le-linux ----------------------- */ +#if defined(PLAT_ppc64le_linux) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Macros to save and align the stack before making a function + call and restore it afterwards as gcc may not keep the stack + pointer aligned if it doesn't realise calls are being made + to other functions. */ + +#define VALGRIND_ALIGN_STACK \ + "mr 28,1\n\t" \ + "rldicr 1,1,0,59\n\t" +#define VALGRIND_RESTORE_STACK \ + "mr 1,28\n\t" + +/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned + long) == 8. 
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + 
_argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds 
current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 6, 32(12)\n\t" /* arg4->r6 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 6, 32(12)\n\t" /* arg4->r6 */ \ + "ld 7, 40(12)\n\t" /* arg5->r7 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + 
VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 6, 32(12)\n\t" /* arg4->r6 */ \ + "ld 7, 40(12)\n\t" /* arg5->r7 */ \ + "ld 8, 48(12)\n\t" /* arg6->r8 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = 
(unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 6, 32(12)\n\t" /* arg4->r6 */ \ + "ld 7, 40(12)\n\t" /* arg5->r7 */ \ + "ld 8, 48(12)\n\t" /* arg6->r8 */ \ + "ld 9, 56(12)\n\t" /* arg7->r9 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 6, 32(12)\n\t" /* arg4->r6 */ \ + "ld 7, 40(12)\n\t" /* arg5->r7 */ \ + "ld 8, 48(12)\n\t" /* arg6->r8 */ \ + "ld 9, 56(12)\n\t" 
/* arg7->r9 */ \ + "ld 10, 64(12)\n\t" /* arg8->r10 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg9 */ \ + "ld 3,72(12)\n\t" \ + "std 3,96(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 6, 32(12)\n\t" /* arg4->r6 */ \ + "ld 7, 40(12)\n\t" /* arg5->r7 */ \ + "ld 8, 48(12)\n\t" /* arg6->r8 */ \ + "ld 9, 56(12)\n\t" /* arg7->r9 */ \ + "ld 10, 64(12)\n\t" /* arg8->r10 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", 
__CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg10 */ \ + "ld 3,80(12)\n\t" \ + "std 3,104(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(12)\n\t" \ + "std 3,96(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 6, 32(12)\n\t" /* arg4->r6 */ \ + "ld 7, 40(12)\n\t" /* arg5->r7 */ \ + "ld 8, 48(12)\n\t" /* arg6->r8 */ \ + "ld 9, 56(12)\n\t" /* arg7->r9 */ \ + "ld 10, 64(12)\n\t" /* arg8->r10 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = 
(orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg11 */ \ + "ld 3,88(12)\n\t" \ + "std 3,112(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(12)\n\t" \ + "std 3,104(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(12)\n\t" \ + "std 3,96(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 6, 32(12)\n\t" /* arg4->r6 */ \ + "ld 7, 40(12)\n\t" /* arg5->r7 */ \ + "ld 8, 48(12)\n\t" /* arg6->r8 */ \ + "ld 9, 56(12)\n\t" /* arg7->r9 */ \ + "ld 10, 64(12)\n\t" /* arg8->r10 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds 
current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "mr 12,%1\n\t" \ + "std 2,-16(12)\n\t" /* save tocptr */ \ + "ld 2,-8(12)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg12 */ \ + "ld 3,96(12)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg11 */ \ + "ld 3,88(12)\n\t" \ + "std 3,112(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(12)\n\t" \ + "std 3,104(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(12)\n\t" \ + "std 3,96(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(12)\n\t" /* arg1->r3 */ \ + "ld 4, 16(12)\n\t" /* arg2->r4 */ \ + "ld 5, 24(12)\n\t" /* arg3->r5 */ \ + "ld 6, 32(12)\n\t" /* arg4->r6 */ \ + "ld 7, 40(12)\n\t" /* arg5->r7 */ \ + "ld 8, 48(12)\n\t" /* arg6->r8 */ \ + "ld 9, 56(12)\n\t" /* arg7->r9 */ \ + "ld 10, 64(12)\n\t" /* arg8->r10 */ \ + "ld 12, 0(12)\n\t" /* target->r12 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R12 \ + "mr 12,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(12)\n\t" /* restore tocptr */ \ + VALGRIND_RESTORE_STACK \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r28" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc64le_linux */ + +/* ------------------------- arm-linux ------------------------- */ + +#if defined(PLAT_arm_linux) + +/* These regs are trashed by the hidden call. 
*/ +#define __CALLER_SAVED_REGS "r0", "r1", "r2", "r3","r4", "r12", "r14" + +/* Macros to save and align the stack before making a function + call and restore it afterwards as gcc may not keep the stack + pointer aligned if it doesn't realise calls are being made + to other functions. */ + +/* This is a bit tricky. We store the original stack pointer in r10 + as it is callee-saves. gcc doesn't allow the use of r11 for some + reason. Also, we can't directly "bic" the stack pointer in thumb + mode since r13 isn't an allowed register number in that context. + So use r4 as a temporary, since that is about to get trashed + anyway, just after each use of this macro. Side effect is we need + to be very careful about any future changes, since + VALGRIND_ALIGN_STACK simply assumes r4 is usable. */ +#define VALGRIND_ALIGN_STACK \ + "mov r10, sp\n\t" \ + "mov r4, sp\n\t" \ + "bic r4, r4, #7\n\t" \ + "mov sp, r4\n\t" +#define VALGRIND_RESTORE_STACK \ + "mov sp, r10\n\t" + +/* These CALL_FN_ macros assume that on arm-linux, sizeof(unsigned + long) == 4. 
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long 
_argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "sub sp, sp, #4 \n\t" \ + 
"ldr r0, [%1, #20] \n\t" \ + "push {r0} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "push {r0, r1} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); 
\ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "sub sp, sp, #4 \n\t" \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "push {r0, r1, r2} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "push {r0, r1, r2, r3} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + 
_argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "sub sp, sp, #4 \n\t" \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "ldr r4, [%1, #36] \n\t" \ + "push {r0, r1, r2, r3, r4} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr r0, [%1, #40] \n\t" \ + "push {r0} \n\t" \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "ldr r4, [%1, #36] \n\t" \ + "push {r0, r1, r2, 
r3, r4} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "sub sp, sp, #4 \n\t" \ + "ldr r0, [%1, #40] \n\t" \ + "ldr r1, [%1, #44] \n\t" \ + "push {r0, r1} \n\t" \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "ldr r4, [%1, #36] \n\t" \ + "push {r0, r1, r2, r3, r4} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + 
arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr r0, [%1, #40] \n\t" \ + "ldr r1, [%1, #44] \n\t" \ + "ldr r2, [%1, #48] \n\t" \ + "push {r0, r1, r2} \n\t" \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "ldr r4, [%1, #36] \n\t" \ + "push {r0, r1, r2, r3, r4} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + VALGRIND_RESTORE_STACK \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r10" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_arm_linux */ + +/* ------------------------ arm64-linux ------------------------ */ + +#if defined(PLAT_arm64_linux) || defined(PLAT_arm64_freebsd) + +/* These regs are trashed by the hidden call. 
*/ +#define __CALLER_SAVED_REGS \ + "x0", "x1", "x2", "x3","x4", "x5", "x6", "x7", "x8", "x9", \ + "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", \ + "x18", "x19", "x20", "x30", \ + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \ + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", \ + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", \ + "v26", "v27", "v28", "v29", "v30", "v31" + +/* x21 is callee-saved, so we can use it to save and restore SP around + the hidden call. */ +#define VALGRIND_ALIGN_STACK \ + "mov x21, sp\n\t" \ + "bic sp, x21, #15\n\t" +#define VALGRIND_RESTORE_STACK \ + "mov sp, x21\n\t" + +/* These CALL_FN_ macros assume that on arm64-linux, + sizeof(unsigned long) == 8. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); 
\ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x3, [%1, #32] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + 
VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x3, [%1, #32] \n\t" \ + "ldr x4, [%1, #40] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x3, [%1, #32] \n\t" \ + "ldr x4, [%1, #40] \n\t" \ + "ldr x5, [%1, #48] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0" \ + 
: /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x3, [%1, #32] \n\t" \ + "ldr x4, [%1, #40] \n\t" \ + "ldr x5, [%1, #48] \n\t" \ + "ldr x6, [%1, #56] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x3, [%1, #32] \n\t" \ + "ldr x4, [%1, #40] \n\t" 
\ + "ldr x5, [%1, #48] \n\t" \ + "ldr x6, [%1, #56] \n\t" \ + "ldr x7, [%1, #64] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "sub sp, sp, #0x20 \n\t" \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x3, [%1, #32] \n\t" \ + "ldr x4, [%1, #40] \n\t" \ + "ldr x5, [%1, #48] \n\t" \ + "ldr x6, [%1, #56] \n\t" \ + "ldr x7, [%1, #64] \n\t" \ + "ldr x8, [%1, #72] \n\t" \ + "str x8, [sp, #0] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned 
long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "sub sp, sp, #0x20 \n\t" \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x3, [%1, #32] \n\t" \ + "ldr x4, [%1, #40] \n\t" \ + "ldr x5, [%1, #48] \n\t" \ + "ldr x6, [%1, #56] \n\t" \ + "ldr x7, [%1, #64] \n\t" \ + "ldr x8, [%1, #72] \n\t" \ + "str x8, [sp, #0] \n\t" \ + "ldr x8, [%1, #80] \n\t" \ + "str x8, [sp, #8] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "sub sp, sp, #0x30 \n\t" \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x3, [%1, #32] \n\t" \ + "ldr x4, [%1, #40] \n\t" \ + "ldr x5, 
[%1, #48] \n\t" \ + "ldr x6, [%1, #56] \n\t" \ + "ldr x7, [%1, #64] \n\t" \ + "ldr x8, [%1, #72] \n\t" \ + "str x8, [sp, #0] \n\t" \ + "ldr x8, [%1, #80] \n\t" \ + "str x8, [sp, #8] \n\t" \ + "ldr x8, [%1, #88] \n\t" \ + "str x8, [sp, #16] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11, \ + arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + VALGRIND_ALIGN_STACK \ + "sub sp, sp, #0x30 \n\t" \ + "ldr x0, [%1, #8] \n\t" \ + "ldr x1, [%1, #16] \n\t" \ + "ldr x2, [%1, #24] \n\t" \ + "ldr x3, [%1, #32] \n\t" \ + "ldr x4, [%1, #40] \n\t" \ + "ldr x5, [%1, #48] \n\t" \ + "ldr x6, [%1, #56] \n\t" \ + "ldr x7, [%1, #64] \n\t" \ + "ldr x8, [%1, #72] \n\t" \ + "str x8, [sp, #0] \n\t" \ + "ldr x8, [%1, #80] \n\t" \ + "str x8, [sp, #8] \n\t" \ + "ldr x8, [%1, #88] \n\t" \ + "str x8, [sp, #16] \n\t" \ + "ldr x8, [%1, #96] \n\t" \ + "str x8, [sp, #24] \n\t" \ + "ldr x8, [%1] \n\t" /* target->x8 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_X8 \ + VALGRIND_RESTORE_STACK \ + "mov %0, x0" \ + : /*out*/ "=r" 
(_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "x21" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_arm64_linux */ + +/* ------------------------- s390x-linux ------------------------- */ + +#if defined(PLAT_s390x_linux) + +/* Similar workaround as amd64 (see above), but we use r11 as frame + pointer and save the old r11 in r7. r11 might be used for + argvec, therefore we copy argvec in r1 since r1 is clobbered + after the call anyway. */ +#if defined(__GNUC__) && defined(__GCC_HAVE_DWARF2_CFI_ASM) +# define __FRAME_POINTER \ + ,"d"(__builtin_dwarf_cfa()) +# define VALGRIND_CFI_PROLOGUE \ + ".cfi_remember_state\n\t" \ + "lgr 1,%1\n\t" /* copy the argvec pointer in r1 */ \ + "lgr 7,11\n\t" \ + "lgr 11,%2\n\t" \ + ".cfi_def_cfa 11, 0\n\t" +# define VALGRIND_CFI_EPILOGUE \ + "lgr 11, 7\n\t" \ + ".cfi_restore_state\n\t" +#else +# define __FRAME_POINTER +# define VALGRIND_CFI_PROLOGUE \ + "lgr 1,%1\n\t" +# define VALGRIND_CFI_EPILOGUE +#endif + +/* Nb: On s390 the stack pointer is properly aligned *at all times* + according to the s390 GCC maintainer. (The ABI specification is not + precise in this regard.) Therefore, VALGRIND_ALIGN_STACK and + VALGRIND_RESTORE_STACK are not defined here. */ + +/* These regs are trashed by the hidden call. Note that we overwrite + r14 in s390_irgen_noredir (VEX/priv/guest_s390_irgen.c) to give the + function a proper return address. All others are ABI defined call + clobbers. 
*/ +#if defined(__VX__) || defined(__S390_VX__) +#define __CALLER_SAVED_REGS "0", "1", "2", "3", "4", "5", "14", \ + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", \ + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", \ + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", \ + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" +#else +#define __CALLER_SAVED_REGS "0", "1", "2", "3", "4", "5", "14", \ + "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7" +#endif + +/* Nb: Although r11 is modified in the asm snippets below (inside + VALGRIND_CFI_PROLOGUE) it is not listed in the clobber section, for + two reasons: + (1) r11 is restored in VALGRIND_CFI_EPILOGUE, so effectively it is not + modified + (2) GCC will complain that r11 cannot appear inside a clobber section, + when compiled with -O -fno-omit-frame-pointer + */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-160\n\t" \ + "lg 1, 0(1)\n\t" /* target->r1 */ \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,160\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "d" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +/* The call abi has the arguments in r2-r6 and stack */ +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-160\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,160\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) 
__FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1, arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-160\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,160\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1, arg2, arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-160\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,160\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1, arg2, arg3, arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + __asm__ 
volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-160\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 5,32(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,160\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1, arg2, arg3, arg4, arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-160\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 5,32(1)\n\t" \ + "lg 6,40(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,160\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1, arg2, arg3, arg4, arg5, \ + arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-168\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 5,32(1)\n\t" \ + "lg 6,40(1)\n\t" \ 
+ "mvc 160(8,15), 48(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,168\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1, arg2, arg3, arg4, arg5, \ + arg6, arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-176\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 5,32(1)\n\t" \ + "lg 6,40(1)\n\t" \ + "mvc 160(8,15), 48(1)\n\t" \ + "mvc 168(8,15), 56(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,176\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1, arg2, arg3, arg4, arg5, \ + arg6, arg7 ,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + __asm__ volatile( \ + 
VALGRIND_CFI_PROLOGUE \ + "aghi 15,-184\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 5,32(1)\n\t" \ + "lg 6,40(1)\n\t" \ + "mvc 160(8,15), 48(1)\n\t" \ + "mvc 168(8,15), 56(1)\n\t" \ + "mvc 176(8,15), 64(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,184\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1, arg2, arg3, arg4, arg5, \ + arg6, arg7 ,arg8, arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-192\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 5,32(1)\n\t" \ + "lg 6,40(1)\n\t" \ + "mvc 160(8,15), 48(1)\n\t" \ + "mvc 168(8,15), 56(1)\n\t" \ + "mvc 176(8,15), 64(1)\n\t" \ + "mvc 184(8,15), 72(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,192\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1, arg2, arg3, arg4, arg5, \ + arg6, arg7 ,arg8, arg9, arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = 
(unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-200\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 5,32(1)\n\t" \ + "lg 6,40(1)\n\t" \ + "mvc 160(8,15), 48(1)\n\t" \ + "mvc 168(8,15), 56(1)\n\t" \ + "mvc 176(8,15), 64(1)\n\t" \ + "mvc 184(8,15), 72(1)\n\t" \ + "mvc 192(8,15), 80(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,200\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1, arg2, arg3, arg4, arg5, \ + arg6, arg7 ,arg8, arg9, arg10, arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-208\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 5,32(1)\n\t" \ + "lg 6,40(1)\n\t" \ + "mvc 160(8,15), 48(1)\n\t" \ + "mvc 168(8,15), 56(1)\n\t" \ + "mvc 
176(8,15), 64(1)\n\t" \ + "mvc 184(8,15), 72(1)\n\t" \ + "mvc 192(8,15), 80(1)\n\t" \ + "mvc 200(8,15), 88(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,208\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1, arg2, arg3, arg4, arg5, \ + arg6, arg7 ,arg8, arg9, arg10, arg11, arg12)\ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + _argvec[12] = (unsigned long)arg12; \ + __asm__ volatile( \ + VALGRIND_CFI_PROLOGUE \ + "aghi 15,-216\n\t" \ + "lg 2, 8(1)\n\t" \ + "lg 3,16(1)\n\t" \ + "lg 4,24(1)\n\t" \ + "lg 5,32(1)\n\t" \ + "lg 6,40(1)\n\t" \ + "mvc 160(8,15), 48(1)\n\t" \ + "mvc 168(8,15), 56(1)\n\t" \ + "mvc 176(8,15), 64(1)\n\t" \ + "mvc 184(8,15), 72(1)\n\t" \ + "mvc 192(8,15), 80(1)\n\t" \ + "mvc 200(8,15), 88(1)\n\t" \ + "mvc 208(8,15), 96(1)\n\t" \ + "lg 1, 0(1)\n\t" \ + VALGRIND_CALL_NOREDIR_R1 \ + "aghi 15,216\n\t" \ + VALGRIND_CFI_EPILOGUE \ + "lgr %0, 2\n\t" \ + : /*out*/ "=d" (_res) \ + : /*in*/ "a" (&_argvec[0]) __FRAME_POINTER \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + + +#endif /* PLAT_s390x_linux */ + +/* ------------------------- mips32-linux ----------------------- */ + +#if 
defined(PLAT_mips32_linux) + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS "$2", "$3", "$4", "$5", "$6", \ +"$7", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \ +"$25", "$31" + +/* These CALL_FN_ macros assume that on mips-linux, sizeof(unsigned + long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "subu $29, $29, 16 \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 16\n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "subu $29, $29, 16 \n\t" \ + "lw $4, 4(%1) \n\t" /* arg1*/ \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 16 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = 
(unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "subu $29, $29, 16 \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 16 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "subu $29, $29, 16 \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 16 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "subu 
$29, $29, 16 \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $7, 16(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 16 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "lw $4, 20(%1) \n\t" \ + "subu $29, $29, 24\n\t" \ + "sw $4, 16($29) \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $7, 16(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 24 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned 
long)(arg6); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "lw $4, 20(%1) \n\t" \ + "subu $29, $29, 32\n\t" \ + "sw $4, 16($29) \n\t" \ + "lw $4, 24(%1) \n\t" \ + "nop\n\t" \ + "sw $4, 20($29) \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $7, 16(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 32 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "lw $4, 20(%1) \n\t" \ + "subu $29, $29, 32\n\t" \ + "sw $4, 16($29) \n\t" \ + "lw $4, 24(%1) \n\t" \ + "sw $4, 20($29) \n\t" \ + "lw $4, 28(%1) \n\t" \ + "sw $4, 24($29) \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $7, 16(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 32 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define 
CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "lw $4, 20(%1) \n\t" \ + "subu $29, $29, 40\n\t" \ + "sw $4, 16($29) \n\t" \ + "lw $4, 24(%1) \n\t" \ + "sw $4, 20($29) \n\t" \ + "lw $4, 28(%1) \n\t" \ + "sw $4, 24($29) \n\t" \ + "lw $4, 32(%1) \n\t" \ + "sw $4, 28($29) \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $7, 16(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 40 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "subu $29, 
$29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "lw $4, 20(%1) \n\t" \ + "subu $29, $29, 40\n\t" \ + "sw $4, 16($29) \n\t" \ + "lw $4, 24(%1) \n\t" \ + "sw $4, 20($29) \n\t" \ + "lw $4, 28(%1) \n\t" \ + "sw $4, 24($29) \n\t" \ + "lw $4, 32(%1) \n\t" \ + "sw $4, 28($29) \n\t" \ + "lw $4, 36(%1) \n\t" \ + "sw $4, 32($29) \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $7, 16(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 40 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "lw $4, 20(%1) \n\t" \ + "subu $29, $29, 48\n\t" \ + "sw $4, 16($29) \n\t" \ + "lw $4, 24(%1) \n\t" \ + "sw $4, 20($29) \n\t" \ + "lw $4, 28(%1) \n\t" \ + "sw $4, 24($29) \n\t" \ + "lw $4, 32(%1) \n\t" \ + "sw $4, 28($29) \n\t" \ + "lw $4, 36(%1) \n\t" \ + "sw $4, 32($29) \n\t" \ + "lw $4, 40(%1) \n\t" \ + "sw $4, 36($29) \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $7, 16(%1) 
\n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 48 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "lw $4, 20(%1) \n\t" \ + "subu $29, $29, 48\n\t" \ + "sw $4, 16($29) \n\t" \ + "lw $4, 24(%1) \n\t" \ + "sw $4, 20($29) \n\t" \ + "lw $4, 28(%1) \n\t" \ + "sw $4, 24($29) \n\t" \ + "lw $4, 32(%1) \n\t" \ + "sw $4, 28($29) \n\t" \ + "lw $4, 36(%1) \n\t" \ + "sw $4, 32($29) \n\t" \ + "lw $4, 40(%1) \n\t" \ + "sw $4, 36($29) \n\t" \ + "lw $4, 44(%1) \n\t" \ + "sw $4, 40($29) \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $7, 16(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 48 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = 
(__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "subu $29, $29, 8 \n\t" \ + "sw $28, 0($29) \n\t" \ + "sw $31, 4($29) \n\t" \ + "lw $4, 20(%1) \n\t" \ + "subu $29, $29, 56\n\t" \ + "sw $4, 16($29) \n\t" \ + "lw $4, 24(%1) \n\t" \ + "sw $4, 20($29) \n\t" \ + "lw $4, 28(%1) \n\t" \ + "sw $4, 24($29) \n\t" \ + "lw $4, 32(%1) \n\t" \ + "sw $4, 28($29) \n\t" \ + "lw $4, 36(%1) \n\t" \ + "sw $4, 32($29) \n\t" \ + "lw $4, 40(%1) \n\t" \ + "sw $4, 36($29) \n\t" \ + "lw $4, 44(%1) \n\t" \ + "sw $4, 40($29) \n\t" \ + "lw $4, 48(%1) \n\t" \ + "sw $4, 44($29) \n\t" \ + "lw $4, 4(%1) \n\t" \ + "lw $5, 8(%1) \n\t" \ + "lw $6, 12(%1) \n\t" \ + "lw $7, 16(%1) \n\t" \ + "lw $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "addu $29, $29, 56 \n\t" \ + "lw $28, 0($29) \n\t" \ + "lw $31, 4($29) \n\t" \ + "addu $29, $29, 8 \n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_mips32_linux */ + +/* ------------------------- nanomips-linux -------------------- */ + +#if defined(PLAT_nanomips_linux) + +/* These regs are trashed by the hidden call. 
*/ +#define __CALLER_SAVED_REGS "$t4", "$t5", "$a0", "$a1", "$a2", \ +"$a3", "$a4", "$a5", "$a6", "$a7", "$t0", "$t1", "$t2", "$t3", \ +"$t8","$t9", "$at" + +/* These CALL_FN_ macros assume that on mips-linux, sizeof(unsigned + long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "lw $t9, 0(%1)\n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "lw $t9, 0(%1)\n\t" \ + "lw $a0, 4(%1)\n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "lw $t9, 0(%1)\n\t" \ + "lw $a0, 4(%1)\n\t" \ + "lw $a1, 8(%1)\n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + 
_argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "lw $t9, 0(%1)\n\t" \ + "lw $a0, 4(%1)\n\t" \ + "lw $a1, 8(%1)\n\t" \ + "lw $a2,12(%1)\n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "lw $t9, 0(%1)\n\t" \ + "lw $a0, 4(%1)\n\t" \ + "lw $a1, 8(%1)\n\t" \ + "lw $a2,12(%1)\n\t" \ + "lw $a3,16(%1)\n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "lw $t9, 0(%1)\n\t" \ + "lw $a0, 4(%1)\n\t" \ + "lw $a1, 8(%1)\n\t" \ + "lw $a2,12(%1)\n\t" \ + "lw $a3,16(%1)\n\t" \ + "lw $a4,20(%1)\n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = 
(__typeof__(lval)) _res; \ + } while (0) +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + "lw $t9, 0(%1)\n\t" \ + "lw $a0, 4(%1)\n\t" \ + "lw $a1, 8(%1)\n\t" \ + "lw $a2,12(%1)\n\t" \ + "lw $a3,16(%1)\n\t" \ + "lw $a4,20(%1)\n\t" \ + "lw $a5,24(%1)\n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "lw $t9, 0(%1)\n\t" \ + "lw $a0, 4(%1)\n\t" \ + "lw $a1, 8(%1)\n\t" \ + "lw $a2,12(%1)\n\t" \ + "lw $a3,16(%1)\n\t" \ + "lw $a4,20(%1)\n\t" \ + "lw $a5,24(%1)\n\t" \ + "lw $a6,28(%1)\n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile 
unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "lw $t9, 0(%1)\n\t" \ + "lw $a0, 4(%1)\n\t" \ + "lw $a1, 8(%1)\n\t" \ + "lw $a2,12(%1)\n\t" \ + "lw $a3,16(%1)\n\t" \ + "lw $a4,20(%1)\n\t" \ + "lw $a5,24(%1)\n\t" \ + "lw $a6,28(%1)\n\t" \ + "lw $a7,32(%1)\n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "addiu $sp, $sp, -16 \n\t" \ + "lw $t9,36(%1) \n\t" \ + "sw $t9, 0($sp) \n\t" \ + "lw $t9, 0(%1) \n\t" \ + "lw $a0, 4(%1) \n\t" \ + "lw $a1, 8(%1) \n\t" \ + "lw $a2,12(%1) \n\t" \ + "lw $a3,16(%1) \n\t" \ + "lw $a4,20(%1) \n\t" \ + "lw $a5,24(%1) \n\t" \ + "lw $a6,28(%1) \n\t" \ + "lw $a7,32(%1) \n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0 \n\t" \ + "addiu $sp, $sp, 16 \n\t" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", 
__CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "addiu $sp, $sp, -16 \n\t" \ + "lw $t9,36(%1) \n\t" \ + "sw $t9, 0($sp) \n\t" \ + "lw $t9,40(%1) \n\t" \ + "sw $t9, 4($sp) \n\t" \ + "lw $t9, 0(%1) \n\t" \ + "lw $a0, 4(%1) \n\t" \ + "lw $a1, 8(%1) \n\t" \ + "lw $a2,12(%1) \n\t" \ + "lw $a3,16(%1) \n\t" \ + "lw $a4,20(%1) \n\t" \ + "lw $a5,24(%1) \n\t" \ + "lw $a6,28(%1) \n\t" \ + "lw $a7,32(%1) \n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0 \n\t" \ + "addiu $sp, $sp, 16 \n\t" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + 
_argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "addiu $sp, $sp, -16 \n\t" \ + "lw $t9,36(%1) \n\t" \ + "sw $t9, 0($sp) \n\t" \ + "lw $t9,40(%1) \n\t" \ + "sw $t9, 4($sp) \n\t" \ + "lw $t9,44(%1) \n\t" \ + "sw $t9, 8($sp) \n\t" \ + "lw $t9, 0(%1) \n\t" \ + "lw $a0, 4(%1) \n\t" \ + "lw $a1, 8(%1) \n\t" \ + "lw $a2,12(%1) \n\t" \ + "lw $a3,16(%1) \n\t" \ + "lw $a4,20(%1) \n\t" \ + "lw $a5,24(%1) \n\t" \ + "lw $a6,28(%1) \n\t" \ + "lw $a7,32(%1) \n\t" \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0 \n\t" \ + "addiu $sp, $sp, 16 \n\t" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "addiu $sp, $sp, -16 \n\t" \ + "lw $t9,36(%1) \n\t" \ + "sw $t9, 0($sp) \n\t" \ + "lw $t9,40(%1) \n\t" \ + "sw $t9, 4($sp) \n\t" \ + "lw $t9,44(%1) \n\t" \ + "sw $t9, 8($sp) \n\t" \ + "lw $t9,48(%1) \n\t" \ + "sw $t9,12($sp) \n\t" \ + "lw $t9, 0(%1) \n\t" \ + "lw $a0, 4(%1) \n\t" \ + "lw $a1, 8(%1) \n\t" \ + "lw $a2,12(%1) \n\t" \ + "lw $a3,16(%1) \n\t" \ + "lw $a4,20(%1) \n\t" \ + "lw $a5,24(%1) \n\t" \ + "lw $a6,28(%1) \n\t" \ + "lw $a7,32(%1) \n\t" 
\ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $a0 \n\t" \ + "addiu $sp, $sp, 16 \n\t" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_nanomips_linux */ + +/* ------------------------- mips64-linux ------------------------- */ + +#if defined(PLAT_mips64_linux) + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS "$2", "$3", "$4", "$5", "$6", \ +"$7", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \ +"$25", "$31" + +/* These CALL_FN_ macros assume that on mips64-linux, + sizeof(long long) == 8. */ + +#define MIPS64_LONG2REG_CAST(x) ((long long)(long)x) + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[1]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + __asm__ volatile( \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[2]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + __asm__ volatile( \ + "ld $4, 8(%1)\n\t" /* arg1*/ \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[3]; \ + volatile unsigned long long _res; \ + _argvec[0] = 
_orig.nraddr; \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + __asm__ volatile( \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[4]; \ + volatile unsigned long long _res; \ + _argvec[0] = _orig.nraddr; \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + __asm__ volatile( \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[5]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + _argvec[4] = MIPS64_LONG2REG_CAST(arg4); \ + __asm__ volatile( \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $7, 32(%1)\n\t" \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + 
volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[6]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + _argvec[4] = MIPS64_LONG2REG_CAST(arg4); \ + _argvec[5] = MIPS64_LONG2REG_CAST(arg5); \ + __asm__ volatile( \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $7, 32(%1)\n\t" \ + "ld $8, 40(%1)\n\t" \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[7]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + _argvec[4] = MIPS64_LONG2REG_CAST(arg4); \ + _argvec[5] = MIPS64_LONG2REG_CAST(arg5); \ + _argvec[6] = MIPS64_LONG2REG_CAST(arg6); \ + __asm__ volatile( \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $7, 32(%1)\n\t" \ + "ld $8, 40(%1)\n\t" \ + "ld $9, 48(%1)\n\t" \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[8]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + 
_argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + _argvec[4] = MIPS64_LONG2REG_CAST(arg4); \ + _argvec[5] = MIPS64_LONG2REG_CAST(arg5); \ + _argvec[6] = MIPS64_LONG2REG_CAST(arg6); \ + _argvec[7] = MIPS64_LONG2REG_CAST(arg7); \ + __asm__ volatile( \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $7, 32(%1)\n\t" \ + "ld $8, 40(%1)\n\t" \ + "ld $9, 48(%1)\n\t" \ + "ld $10, 56(%1)\n\t" \ + "ld $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[9]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + _argvec[4] = MIPS64_LONG2REG_CAST(arg4); \ + _argvec[5] = MIPS64_LONG2REG_CAST(arg5); \ + _argvec[6] = MIPS64_LONG2REG_CAST(arg6); \ + _argvec[7] = MIPS64_LONG2REG_CAST(arg7); \ + _argvec[8] = MIPS64_LONG2REG_CAST(arg8); \ + __asm__ volatile( \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $7, 32(%1)\n\t" \ + "ld $8, 40(%1)\n\t" \ + "ld $9, 48(%1)\n\t" \ + "ld $10, 56(%1)\n\t" \ + "ld $11, 64(%1)\n\t" \ + "ld $25, 0(%1) \n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + 
volatile unsigned long long _argvec[10]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + _argvec[4] = MIPS64_LONG2REG_CAST(arg4); \ + _argvec[5] = MIPS64_LONG2REG_CAST(arg5); \ + _argvec[6] = MIPS64_LONG2REG_CAST(arg6); \ + _argvec[7] = MIPS64_LONG2REG_CAST(arg7); \ + _argvec[8] = MIPS64_LONG2REG_CAST(arg8); \ + _argvec[9] = MIPS64_LONG2REG_CAST(arg9); \ + __asm__ volatile( \ + "dsubu $29, $29, 8\n\t" \ + "ld $4, 72(%1)\n\t" \ + "sd $4, 0($29)\n\t" \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $7, 32(%1)\n\t" \ + "ld $8, 40(%1)\n\t" \ + "ld $9, 48(%1)\n\t" \ + "ld $10, 56(%1)\n\t" \ + "ld $11, 64(%1)\n\t" \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "daddu $29, $29, 8\n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[11]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + _argvec[4] = MIPS64_LONG2REG_CAST(arg4); \ + _argvec[5] = MIPS64_LONG2REG_CAST(arg5); \ + _argvec[6] = MIPS64_LONG2REG_CAST(arg6); \ + _argvec[7] = MIPS64_LONG2REG_CAST(arg7); \ + _argvec[8] = MIPS64_LONG2REG_CAST(arg8); \ + _argvec[9] = MIPS64_LONG2REG_CAST(arg9); \ + _argvec[10] = MIPS64_LONG2REG_CAST(arg10); \ + __asm__ volatile( \ + "dsubu $29, $29, 16\n\t" \ + "ld $4, 72(%1)\n\t" \ + "sd $4, 0($29)\n\t" \ + "ld $4, 80(%1)\n\t" \ + "sd $4, 8($29)\n\t" \ + "ld $4, 
8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $7, 32(%1)\n\t" \ + "ld $8, 40(%1)\n\t" \ + "ld $9, 48(%1)\n\t" \ + "ld $10, 56(%1)\n\t" \ + "ld $11, 64(%1)\n\t" \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "daddu $29, $29, 16\n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[12]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + _argvec[4] = MIPS64_LONG2REG_CAST(arg4); \ + _argvec[5] = MIPS64_LONG2REG_CAST(arg5); \ + _argvec[6] = MIPS64_LONG2REG_CAST(arg6); \ + _argvec[7] = MIPS64_LONG2REG_CAST(arg7); \ + _argvec[8] = MIPS64_LONG2REG_CAST(arg8); \ + _argvec[9] = MIPS64_LONG2REG_CAST(arg9); \ + _argvec[10] = MIPS64_LONG2REG_CAST(arg10); \ + _argvec[11] = MIPS64_LONG2REG_CAST(arg11); \ + __asm__ volatile( \ + "dsubu $29, $29, 24\n\t" \ + "ld $4, 72(%1)\n\t" \ + "sd $4, 0($29)\n\t" \ + "ld $4, 80(%1)\n\t" \ + "sd $4, 8($29)\n\t" \ + "ld $4, 88(%1)\n\t" \ + "sd $4, 16($29)\n\t" \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $7, 32(%1)\n\t" \ + "ld $8, 40(%1)\n\t" \ + "ld $9, 48(%1)\n\t" \ + "ld $10, 56(%1)\n\t" \ + "ld $11, 64(%1)\n\t" \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "daddu $29, $29, 24\n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + 
arg6,arg7,arg8,arg9,arg10, \ + arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long long _argvec[13]; \ + volatile unsigned long long _res; \ + _argvec[0] = MIPS64_LONG2REG_CAST(_orig.nraddr); \ + _argvec[1] = MIPS64_LONG2REG_CAST(arg1); \ + _argvec[2] = MIPS64_LONG2REG_CAST(arg2); \ + _argvec[3] = MIPS64_LONG2REG_CAST(arg3); \ + _argvec[4] = MIPS64_LONG2REG_CAST(arg4); \ + _argvec[5] = MIPS64_LONG2REG_CAST(arg5); \ + _argvec[6] = MIPS64_LONG2REG_CAST(arg6); \ + _argvec[7] = MIPS64_LONG2REG_CAST(arg7); \ + _argvec[8] = MIPS64_LONG2REG_CAST(arg8); \ + _argvec[9] = MIPS64_LONG2REG_CAST(arg9); \ + _argvec[10] = MIPS64_LONG2REG_CAST(arg10); \ + _argvec[11] = MIPS64_LONG2REG_CAST(arg11); \ + _argvec[12] = MIPS64_LONG2REG_CAST(arg12); \ + __asm__ volatile( \ + "dsubu $29, $29, 32\n\t" \ + "ld $4, 72(%1)\n\t" \ + "sd $4, 0($29)\n\t" \ + "ld $4, 80(%1)\n\t" \ + "sd $4, 8($29)\n\t" \ + "ld $4, 88(%1)\n\t" \ + "sd $4, 16($29)\n\t" \ + "ld $4, 96(%1)\n\t" \ + "sd $4, 24($29)\n\t" \ + "ld $4, 8(%1)\n\t" \ + "ld $5, 16(%1)\n\t" \ + "ld $6, 24(%1)\n\t" \ + "ld $7, 32(%1)\n\t" \ + "ld $8, 40(%1)\n\t" \ + "ld $9, 48(%1)\n\t" \ + "ld $10, 56(%1)\n\t" \ + "ld $11, 64(%1)\n\t" \ + "ld $25, 0(%1)\n\t" /* target->t9 */ \ + VALGRIND_CALL_NOREDIR_T9 \ + "daddu $29, $29, 32\n\t" \ + "move %0, $2\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) (long)_res; \ + } while (0) + +#endif /* PLAT_mips64_linux */ + +/* ------------------------------------------------------------------ */ +/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS. */ +/* */ +/* ------------------------------------------------------------------ */ + +/* Some request codes. There are many more of these, but most are not + exposed to end-user view. These are the public ones, all of the + form 0x1000 + small_number. + + Core ones are in the range 0x00000000--0x0000ffff. 
The non-public + ones start at 0x2000. +*/ + +/* These macros are used by tools -- they must be public, but don't + embed them into other programs. */ +#define VG_USERREQ_TOOL_BASE(a,b) \ + ((unsigned int)(((a)&0xff) << 24 | ((b)&0xff) << 16)) +#define VG_IS_TOOL_USERREQ(a, b, v) \ + (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000)) + +/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! + This enum comprises an ABI exported by Valgrind to programs + which use client requests. DO NOT CHANGE THE NUMERIC VALUES OF THESE + ENTRIES, NOR DELETE ANY -- add new ones at the end of the most + relevant group. */ +typedef + enum { VG_USERREQ__RUNNING_ON_VALGRIND = 0x1001, + VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002, + + /* These allow any function to be called from the simulated + CPU but run on the real CPU. Nb: the first arg passed to + the function is always the ThreadId of the running + thread! So CLIENT_CALL0 actually requires a 1 arg + function, etc. */ + VG_USERREQ__CLIENT_CALL0 = 0x1101, + VG_USERREQ__CLIENT_CALL1 = 0x1102, + VG_USERREQ__CLIENT_CALL2 = 0x1103, + VG_USERREQ__CLIENT_CALL3 = 0x1104, + + /* Can be useful in regression testing suites -- eg. can + send Valgrind's output to /dev/null and still count + errors. */ + VG_USERREQ__COUNT_ERRORS = 0x1201, + + /* Allows the client program and/or gdbserver to execute a monitor + command. */ + VG_USERREQ__GDB_MONITOR_COMMAND = 0x1202, + + /* Allows the client program to change a dynamic command line + option. */ + VG_USERREQ__CLO_CHANGE = 0x1203, + + /* These are useful and can be interpreted by any tool that + tracks malloc() et al, by using vg_replace_malloc.c. */ + VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301, + VG_USERREQ__RESIZEINPLACE_BLOCK = 0x130b, + VG_USERREQ__FREELIKE_BLOCK = 0x1302, + /* Memory pool support. 
*/ + VG_USERREQ__CREATE_MEMPOOL = 0x1303, + VG_USERREQ__DESTROY_MEMPOOL = 0x1304, + VG_USERREQ__MEMPOOL_ALLOC = 0x1305, + VG_USERREQ__MEMPOOL_FREE = 0x1306, + VG_USERREQ__MEMPOOL_TRIM = 0x1307, + VG_USERREQ__MOVE_MEMPOOL = 0x1308, + VG_USERREQ__MEMPOOL_CHANGE = 0x1309, + VG_USERREQ__MEMPOOL_EXISTS = 0x130a, + + /* Allow printfs to valgrind log. */ + /* The first two pass the va_list argument by value, which + assumes it is the same size as or smaller than a UWord, + which generally isn't the case. Hence are deprecated. + The second two pass the vargs by reference and so are + immune to this problem. */ + /* both :: char* fmt, va_list vargs (DEPRECATED) */ + VG_USERREQ__PRINTF = 0x1401, + VG_USERREQ__PRINTF_BACKTRACE = 0x1402, + /* both :: char* fmt, va_list* vargs */ + VG_USERREQ__PRINTF_VALIST_BY_REF = 0x1403, + VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF = 0x1404, + + /* Stack support. */ + VG_USERREQ__STACK_REGISTER = 0x1501, + VG_USERREQ__STACK_DEREGISTER = 0x1502, + VG_USERREQ__STACK_CHANGE = 0x1503, + + /* Wine support */ + VG_USERREQ__LOAD_PDB_DEBUGINFO = 0x1601, + + /* Querying of debug info. */ + VG_USERREQ__MAP_IP_TO_SRCLOC = 0x1701, + + /* Disable/enable error reporting level. Takes a single + Word arg which is the delta to this thread's error + disablement indicator. Hence 1 disables or further + disables errors, and -1 moves back towards enablement. + Other values are not allowed. */ + VG_USERREQ__CHANGE_ERR_DISABLEMENT = 0x1801, + + /* Some requests used for Valgrind internal, such as + self-test or self-hosting. */ + /* Initialise IR injection */ + VG_USERREQ__VEX_INIT_FOR_IRI = 0x1901, + /* Used by Inner Valgrind to inform Outer Valgrind where to + find the list of inner guest threads */ + VG_USERREQ__INNER_THREADS = 0x1902 + } Vg_ClientRequest; + +#if !defined(__GNUC__) +# define __extension__ /* */ +#endif + + +/* Returns the number of Valgrinds this code is running under. 
That + is, 0 if running natively, 1 if running under Valgrind, 2 if + running under Valgrind which is running under another Valgrind, + etc. */ +#define RUNNING_ON_VALGRIND \ + (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* if not */, \ + VG_USERREQ__RUNNING_ON_VALGRIND, \ + 0, 0, 0, 0, 0) \ + + +/* Discard translation of code in the range [_qzz_addr .. _qzz_addr + + _qzz_len - 1]. Useful if you are debugging a JITter or some such, + since it provides a way to make sure valgrind will retranslate the + invalidated area. Returns no value. */ +#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DISCARD_TRANSLATIONS, \ + _qzz_addr, _qzz_len, 0, 0, 0) + +#define VALGRIND_INNER_THREADS(_qzz_addr) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__INNER_THREADS, \ + _qzz_addr, 0, 0, 0, 0) + + +/* These requests are for getting Valgrind itself to print something. + Possibly with a backtrace. This is a really ugly hack. The return value + is the number of characters printed, excluding the "**** " part at the + start and the backtrace (if present). */ + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) && !defined(_MSC_VER) +/* Modern GCC will optimize the static routine out if unused, + and unused attribute will shut down warnings about it. */ +static int VALGRIND_PRINTF(const char *format, ...) + __attribute__((format(__printf__, 1, 2), __unused__)); +#endif +static int +#if defined(_MSC_VER) +__inline +#endif +VALGRIND_PRINTF(const char *format, ...) 
+{ +#if defined(NVALGRIND) + (void)format; + return 0; +#else /* NVALGRIND */ +#if defined(_MSC_VER) || defined(__MINGW64__) + uintptr_t _qzz_res; +#else + unsigned long _qzz_res; +#endif + va_list vargs; + va_start(vargs, format); +#if defined(_MSC_VER) || defined(__MINGW64__) + _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(0, + VG_USERREQ__PRINTF_VALIST_BY_REF, + (uintptr_t)format, + (uintptr_t)&vargs, + 0, 0, 0); +#else + _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(0, + VG_USERREQ__PRINTF_VALIST_BY_REF, + (unsigned long)format, + (unsigned long)&vargs, + 0, 0, 0); +#endif + va_end(vargs); + return (int)_qzz_res; +#endif /* NVALGRIND */ +} + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) && !defined(_MSC_VER) +static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...) + __attribute__((format(__printf__, 1, 2), __unused__)); +#endif +static int +#if defined(_MSC_VER) +__inline +#endif +VALGRIND_PRINTF_BACKTRACE(const char *format, ...) +{ +#if defined(NVALGRIND) + (void)format; + return 0; +#else /* NVALGRIND */ +#if defined(_MSC_VER) || defined(__MINGW64__) + uintptr_t _qzz_res; +#else + unsigned long _qzz_res; +#endif + va_list vargs; + va_start(vargs, format); +#if defined(_MSC_VER) || defined(__MINGW64__) + _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(0, + VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF, + (uintptr_t)format, + (uintptr_t)&vargs, + 0, 0, 0); +#else + _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(0, + VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF, + (unsigned long)format, + (unsigned long)&vargs, + 0, 0, 0); +#endif + va_end(vargs); + return (int)_qzz_res; +#endif /* NVALGRIND */ +} + + +/* These requests allow control to move from the simulated CPU to the + real CPU, calling an arbitrary function. + + Note that the current ThreadId is inserted as the first argument. + So this call: + + VALGRIND_NON_SIMD_CALL2(f, arg1, arg2) + + requires f to have this signature: + + Word f(Word tid, Word arg1, Word arg2) + + where "Word" is a word-sized type. 
+ + Note that these client requests are not entirely reliable. For example, + if you call a function with them that subsequently calls printf(), + there's a high chance Valgrind will crash. Generally, your prospects of + these working are made higher if the called function does not refer to + any global variables, and does not refer to any libc or other functions + (printf et al). Any kind of entanglement with libc or dynamic linking is + likely to have a bad outcome, for tricky reasons which we've grappled + with a lot in the past. +*/ +#define VALGRIND_NON_SIMD_CALL0(_qyy_fn) \ + VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */, \ + VG_USERREQ__CLIENT_CALL0, \ + _qyy_fn, \ + 0, 0, 0, 0) + +#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1) \ + VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */, \ + VG_USERREQ__CLIENT_CALL1, \ + _qyy_fn, \ + _qyy_arg1, 0, 0, 0) + +#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2) \ + VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */, \ + VG_USERREQ__CLIENT_CALL2, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, 0, 0) + +#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \ + VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */, \ + VG_USERREQ__CLIENT_CALL3, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, \ + _qyy_arg3, 0) + + +/* Counts the number of errors that have been recorded by a tool. Nb: + the tool must record the errors with VG_(maybe_record_error)() or + VG_(unique_error)() for them to be counted. */ +#define VALGRIND_COUNT_ERRORS \ + (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR( \ + 0 /* default return */, \ + VG_USERREQ__COUNT_ERRORS, \ + 0, 0, 0, 0, 0) + +/* Several Valgrind tools (Memcheck, Massif, Helgrind, DRD) rely on knowing + when heap blocks are allocated in order to give accurate results. This + happens automatically for the standard allocator functions such as + malloc(), calloc(), realloc(), memalign(), new, new[], free(), delete, + delete[], etc. 
+ + But if your program uses a custom allocator, this doesn't automatically + happen, and Valgrind will not do as well. For example, if you allocate + superblocks with mmap() and then allocates chunks of the superblocks, all + Valgrind's observations will be at the mmap() level and it won't know that + the chunks should be considered separate entities. In Memcheck's case, + that means you probably won't get heap block overrun detection (because + there won't be redzones marked as unaddressable) and you definitely won't + get any leak detection. + + The following client requests allow a custom allocator to be annotated so + that it can be handled accurately by Valgrind. + + VALGRIND_MALLOCLIKE_BLOCK marks a region of memory as having been allocated + by a malloc()-like function. For Memcheck (an illustrative case), this + does two things: + + - It records that the block has been allocated. This means any addresses + within the block mentioned in error messages will be + identified as belonging to the block. It also means that if the block + isn't freed it will be detected by the leak checker. + + - It marks the block as being addressable and undefined (if 'is_zeroed' is + not set), or addressable and defined (if 'is_zeroed' is set). This + controls how accesses to the block by the program are handled. + + 'addr' is the start of the usable block (ie. after any + redzone), 'sizeB' is its size. 'rzB' is the redzone size if the allocator + can apply redzones -- these are blocks of padding at the start and end of + each block. Adding redzones is recommended as it makes it much more likely + Valgrind will spot block overruns. `is_zeroed' indicates if the memory is + zeroed (or filled with another predictable value), as is the case for + calloc(). + + VALGRIND_MALLOCLIKE_BLOCK should be put immediately after the point where a + heap block -- that will be used by the client program -- is allocated. 
+ It's best to put it at the outermost level of the allocator if possible; + for example, if you have a function my_alloc() which calls + internal_alloc(), and the client request is put inside internal_alloc(), + stack traces relating to the heap block will contain entries for both + my_alloc() and internal_alloc(), which is probably not what you want. + + For Memcheck users: if you use VALGRIND_MALLOCLIKE_BLOCK to carve out + custom blocks from within a heap block, B, that has been allocated with + malloc/calloc/new/etc, then block B will be *ignored* during leak-checking + -- the custom blocks will take precedence. + + VALGRIND_FREELIKE_BLOCK is the partner to VALGRIND_MALLOCLIKE_BLOCK. For + Memcheck, it does two things: + + - It records that the block has been deallocated. This assumes that the + block was annotated as having been allocated via + VALGRIND_MALLOCLIKE_BLOCK. Otherwise, an error will be issued. + + - It marks the block as being unaddressable. + + VALGRIND_FREELIKE_BLOCK should be put immediately after the point where a + heap block is deallocated. + + VALGRIND_RESIZEINPLACE_BLOCK informs a tool about reallocation. For + Memcheck, it does four things: + + - It records that the size of a block has been changed. This assumes that + the block was annotated as having been allocated via + VALGRIND_MALLOCLIKE_BLOCK. Otherwise, an error will be issued. + + - If the block shrunk, it marks the freed memory as being unaddressable. + + - If the block grew, it marks the new area as undefined and defines a red + zone past the end of the new block. + + - The V-bits of the overlap between the old and the new block are preserved. + + VALGRIND_RESIZEINPLACE_BLOCK should be put after allocation of the new block + and before deallocation of the old block. + + In many cases, these three client requests will not be enough to get your + allocator working well with Memcheck. 
More specifically, if your allocator + writes to freed blocks in any way then a VALGRIND_MAKE_MEM_UNDEFINED call + will be necessary to mark the memory as addressable just before the zeroing + occurs, otherwise you'll get a lot of invalid write errors. For example, + you'll need to do this if your allocator recycles freed blocks, but it + zeroes them before handing them back out (via VALGRIND_MALLOCLIKE_BLOCK). + Alternatively, if your allocator reuses freed blocks for allocator-internal + data structures, VALGRIND_MAKE_MEM_UNDEFINED calls will also be necessary. + + Really, what's happening is a blurring of the lines between the client + program and the allocator... after VALGRIND_FREELIKE_BLOCK is called, the + memory should be considered unaddressable to the client program, but the + allocator knows more than the rest of the client program and so may be able + to safely access it. Extra client requests are necessary for Valgrind to + understand the distinction between the allocator and the rest of the + program. + + Ignored if addr == 0. +*/ +#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MALLOCLIKE_BLOCK, \ + addr, sizeB, rzB, is_zeroed, 0) + +/* See the comment for VALGRIND_MALLOCLIKE_BLOCK for details. + Ignored if addr == 0. +*/ +#define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__RESIZEINPLACE_BLOCK, \ + addr, oldSizeB, newSizeB, rzB, 0) + +/* See the comment for VALGRIND_MALLOCLIKE_BLOCK for details. + Ignored if addr == 0. +*/ +#define VALGRIND_FREELIKE_BLOCK(addr, rzB) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__FREELIKE_BLOCK, \ + addr, rzB, 0, 0, 0) + +/* Create a memory pool. */ +#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CREATE_MEMPOOL, \ + pool, rzB, is_zeroed, 0, 0) + +/* Create a memory pool with some flags specifying extended behaviour. 
+ When flags is zero, the behaviour is identical to VALGRIND_CREATE_MEMPOOL. + + The flag VALGRIND_MEMPOOL_METAPOOL specifies that the pieces of memory + associated with the pool using VALGRIND_MEMPOOL_ALLOC will be used + by the application as superblocks to dole out MALLOC_LIKE blocks using + VALGRIND_MALLOCLIKE_BLOCK. In other words, a meta pool is a "2 levels" + pool : first level is the blocks described by VALGRIND_MEMPOOL_ALLOC. + The second level blocks are described using VALGRIND_MALLOCLIKE_BLOCK. + Note that the association between the pool and the second level blocks + is implicit : second level blocks will be located inside first level + blocks. It is necessary to use the VALGRIND_MEMPOOL_METAPOOL flag + for such 2 levels pools, as otherwise valgrind will detect overlapping + memory blocks, and will abort execution (e.g. during leak search). + + Such a meta pool can also be marked as an 'auto free' pool using the flag + VALGRIND_MEMPOOL_AUTO_FREE, which must be OR-ed together with the + VALGRIND_MEMPOOL_METAPOOL. For an 'auto free' pool, VALGRIND_MEMPOOL_FREE + will automatically free the second level blocks that are contained + inside the first level block freed with VALGRIND_MEMPOOL_FREE. + In other words, calling VALGRIND_MEMPOOL_FREE will cause implicit calls + to VALGRIND_FREELIKE_BLOCK for all the second level blocks included + in the first level block. + Note: it is an error to use the VALGRIND_MEMPOOL_AUTO_FREE flag + without the VALGRIND_MEMPOOL_METAPOOL flag. +*/ +#define VALGRIND_MEMPOOL_AUTO_FREE 1 +#define VALGRIND_MEMPOOL_METAPOOL 2 +#define VALGRIND_CREATE_MEMPOOL_EXT(pool, rzB, is_zeroed, flags) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CREATE_MEMPOOL, \ + pool, rzB, is_zeroed, flags, 0) + +/* Destroy a memory pool. */ +#define VALGRIND_DESTROY_MEMPOOL(pool) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DESTROY_MEMPOOL, \ + pool, 0, 0, 0, 0) + +/* Associate a piece of memory with a memory pool. 
*/ +#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_ALLOC, \ + pool, addr, size, 0, 0) + +/* Disassociate a piece of memory from a memory pool. */ +#define VALGRIND_MEMPOOL_FREE(pool, addr) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_FREE, \ + pool, addr, 0, 0, 0) + +/* Disassociate any pieces outside a particular range. */ +#define VALGRIND_MEMPOOL_TRIM(pool, addr, size) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_TRIM, \ + pool, addr, size, 0, 0) + +/* Resize and/or move a piece associated with a memory pool. */ +#define VALGRIND_MOVE_MEMPOOL(poolA, poolB) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MOVE_MEMPOOL, \ + poolA, poolB, 0, 0, 0) + +/* Resize and/or move a piece associated with a memory pool. */ +#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_CHANGE, \ + pool, addrA, addrB, size, 0) + +/* Return 1 if a mempool exists, else 0. */ +#define VALGRIND_MEMPOOL_EXISTS(pool) \ + (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0, \ + VG_USERREQ__MEMPOOL_EXISTS, \ + pool, 0, 0, 0, 0) + +/* Mark a piece of memory as being a stack. Returns a stack id. + start is the lowest addressable stack byte, end is the highest + addressable stack byte. */ +#define VALGRIND_STACK_REGISTER(start, end) \ + (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0, \ + VG_USERREQ__STACK_REGISTER, \ + start, end, 0, 0, 0) + +/* Unmark the piece of memory associated with a stack id as being a + stack. */ +#define VALGRIND_STACK_DEREGISTER(id) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STACK_DEREGISTER, \ + id, 0, 0, 0, 0) + +/* Change the start and end address of the stack id. + start is the new lowest addressable stack byte, end is the new highest + addressable stack byte. 
*/ +#define VALGRIND_STACK_CHANGE(id, start, end) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STACK_CHANGE, \ + id, start, end, 0, 0) + +/* Load PDB debug info for Wine PE image_map. */ +#define VALGRIND_LOAD_PDB_DEBUGINFO(fd, ptr, total_size, delta) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__LOAD_PDB_DEBUGINFO, \ + fd, ptr, total_size, delta, 0) + +/* Map a code address to a source file name and line number. buf64 + must point to a 64-byte buffer in the caller's address space. The + result will be dumped in there and is guaranteed to be zero + terminated. If no info is found, the first byte is set to zero. */ +#define VALGRIND_MAP_IP_TO_SRCLOC(addr, buf64) \ + (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0, \ + VG_USERREQ__MAP_IP_TO_SRCLOC, \ + addr, buf64, 0, 0, 0) + +/* Disable error reporting for this thread. Behaves in a stack like + way, so you can safely call this multiple times provided that + VALGRIND_ENABLE_ERROR_REPORTING is called the same number of times + to re-enable reporting. The first call of this macro disables + reporting. Subsequent calls have no effect except to increase the + number of VALGRIND_ENABLE_ERROR_REPORTING calls needed to re-enable + reporting. Child threads do not inherit this setting from their + parents -- they are always created with reporting enabled. */ +#define VALGRIND_DISABLE_ERROR_REPORTING \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CHANGE_ERR_DISABLEMENT, \ + 1, 0, 0, 0, 0) + +/* Re-enable error reporting, as per comments on + VALGRIND_DISABLE_ERROR_REPORTING. */ +#define VALGRIND_ENABLE_ERROR_REPORTING \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CHANGE_ERR_DISABLEMENT, \ + -1, 0, 0, 0, 0) + +/* Execute a monitor command from the client program. + If a connection is opened with GDB, the output will be sent + according to the output mode set for vgdb. + If no connection is opened, output will go to the log output. + Returns 1 if command not recognised, 0 otherwise. 
*/ +#define VALGRIND_MONITOR_COMMAND(command) \ + VALGRIND_DO_CLIENT_REQUEST_EXPR(0, VG_USERREQ__GDB_MONITOR_COMMAND, \ + command, 0, 0, 0, 0) + + +/* Change the value of a dynamic command line option. + Note that unknown or not dynamically changeable options + will cause a warning message to be output. */ +#define VALGRIND_CLO_CHANGE(option) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CLO_CHANGE, \ + option, 0, 0, 0, 0) + + +#undef PLAT_x86_darwin +#undef PLAT_amd64_darwin +#undef PLAT_x86_win32 +#undef PLAT_amd64_win64 +#undef PLAT_x86_linux +#undef PLAT_amd64_linux +#undef PLAT_ppc32_linux +#undef PLAT_ppc64be_linux +#undef PLAT_ppc64le_linux +#undef PLAT_arm_linux +#undef PLAT_s390x_linux +#undef PLAT_mips32_linux +#undef PLAT_mips64_linux +#undef PLAT_nanomips_linux +#undef PLAT_x86_solaris +#undef PLAT_amd64_solaris + +#endif /* __VALGRIND_H */ +/* clang-format on */ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/madevent_makefile_source_addon b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/madevent_makefile_source_addon new file mode 100644 index 0000000000..011573d7ab --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/madevent_makefile_source_addon @@ -0,0 +1,4 @@ +cleanavxs: # Clean builds: fortran in all P*; cudacpp for all AVX in all P* and in src + for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; + +cleanall: cleanSource cleanavxs # Clean builds: fortran in this Source and in all P*; cudacpp for all AVX in all P* and in src \ No newline at end of file diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/read_slha.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/read_slha.cc new file mode 100644 index 0000000000..932d405605 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/read_slha.cc @@ -0,0 +1,249 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. 
Alwall (Sep 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +//========================================================================== + +#include "read_slha.h" + +#include +#include + +#include +#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif +#include +#include + +void +SLHABlock::set_entry( std::vector indices, double value ) +{ + if( _entries.size() == 0 ) + _indices = indices.size(); + else if( indices.size() != _indices ) + throw "Wrong number of indices in set_entry"; + + _entries[indices] = value; +} + +double +SLHABlock::get_entry( std::vector indices, double def_val ) +{ + if( _entries.find( indices ) == _entries.end() ) + { + std::cout << "Warning: No such entry in " << _name << ", using default value " + << def_val << std::endl; + return def_val; + } + return _entries[indices]; +} + +std::string +SLHAReader::get_exe_path() // see https://cplusplus.com/forum/general/11104 +{ + char result[PATH_MAX]; + ssize_t count = readlink( "/proc/self/exe", result, PATH_MAX ); + return std::string( result, ( count > 0 ) ? count : 0 ); +} + +void +SLHAReader::read_slha_file( std::string file_name, bool verbose ) +{ + const char envpath[] = "MG5AMC_CARD_PATH"; + std::ifstream param_card; + param_card.open( file_name.c_str(), std::ifstream::in ); + if( param_card.good() ) + { + if( verbose ) std::cout << "Opened slha file " << file_name << " for reading" << std::endl; + } + else if( getenv( envpath ) ) + { + std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" + << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else + const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::size_t foundsep = file_name.find_last_of( "/" ); // strip dirname from file_name #923 + const std::string base_name = ( foundsep != std::string::npos ? file_name.substr( foundsep + 1 ) : file_name ); + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + base_name; // bypass std::filesystem #803 + param_card.open( file_name2.c_str(), std::ifstream::in ); + if( param_card.good() ) + { + std::cout << "Opened slha file " << file_name2 << " for reading" << std::endl; + } + else + { + std::cout << "ERROR! Card file '" << file_name2 << "' does not exist" << std::endl; + throw "Error while opening param card"; + } + } + else + { + const std::string exepath = SLHAReader::get_exe_path(); + if( exepath != "" ) + { + std::cout << "WARNING! Card file '" << file_name << "' does not exist:" + << " and environment variable '" << envpath << "' is not set:" + << " look for the file in a path relative to the executable path '" << exepath << "'" << std::endl; + + const std::size_t foundsep = exepath.find_last_of( "/" ); + const std::string exedir = ( foundsep != std::string::npos ? exepath.substr( 0, foundsep ) : std::string( "." ) ); + const std::string file_name2 = exedir + "/" + file_name; + param_card.open( file_name2.c_str(), std::ifstream::in ); + if( param_card.good() ) + { + std::cout << "Opened slha file " << file_name2 << " for reading" << std::endl; + } + else + { + std::cout << "WARNING! 
Card file '" << file_name2 << "' does not exist: try one level higher" << std::endl; + } + const std::string file_name3 = exedir + "/../" + file_name; + param_card.open( file_name3.c_str(), std::ifstream::in ); + if( param_card.good() ) + { + std::cout << "Opened slha file " << file_name3 << " for reading" << std::endl; + } + else + { + std::cout << "ERROR! Card file '" << file_name3 << "' does not exist" << std::endl; + throw "Error while opening param card"; + } + } + else + { + std::cout << "ERROR! Card file '" << file_name << "' does not exist" + << " and environment variable '" << envpath << "' is not set" + << " and executable path cannot be determined" << std::endl; + throw "Error while opening param card"; + } + } + char buf[200]; + std::string line; + std::string block( "" ); + while( param_card.good() ) + { + param_card.getline( buf, 200 ); + line = buf; + // Change to lowercase + transform( line.begin(), line.end(), line.begin(), (int ( * )( int ))tolower ); + if( line != "" && line[0] != '#' ) + { + if( block != "" ) + { + // Look for double index blocks + double dindex1, dindex2; + double value; + std::stringstream linestr2( line ); + if( linestr2 >> dindex1 >> dindex2 >> value && + dindex1 == int( dindex1 ) and dindex2 == int( dindex2 ) ) + { + std::vector indices; + indices.push_back( int( dindex1 ) ); + indices.push_back( int( dindex2 ) ); + set_block_entry( block, indices, value ); + // Done with this line, read next + continue; + } + std::stringstream linestr1( line ); + // Look for single index blocks + if( linestr1 >> dindex1 >> value && dindex1 == int( dindex1 ) ) + { + std::vector indices; + indices.push_back( int( dindex1 ) ); + set_block_entry( block, indices, value ); + // Done with this line, read next + continue; + } + } + // Look for block + if( line.find( "block " ) != line.npos ) + { + line = line.substr( 6 ); + // Get rid of spaces between block and block name + while( line[0] == ' ' ) + line = line.substr( 1 ); + // Now find end of 
block name + size_t space_pos = line.find( ' ' ); + if( space_pos != std::string::npos ) + line = line.substr( 0, space_pos ); + block = line; + continue; + } + // Look for decay + if( line.find( "decay " ) == 0 ) + { + line = line.substr( 6 ); + block = ""; + std::stringstream linestr( line ); + int pdg_code; + double value; + if( linestr >> pdg_code >> value ) + set_block_entry( "decay", pdg_code, value ); + else + std::cout << "Warning: Wrong format for decay block " << line << std::endl; + continue; + } + } + } + if( _blocks.size() == 0 ) + throw "No information read from SLHA card"; + + param_card.close(); +} + +double +SLHAReader::get_block_entry( std::string block_name, std::vector indices, double def_val ) +{ + if( _blocks.find( block_name ) == _blocks.end() ) + { + std::cout << "No such block " << block_name << ", using default value " + << def_val << std::endl; + return def_val; + } + return _blocks[block_name].get_entry( indices ); +} + +double +SLHAReader::get_block_entry( std::string block_name, int index, double def_val ) +{ + std::vector indices; + indices.push_back( index ); + return get_block_entry( block_name, indices, def_val ); +} + +void +SLHAReader::set_block_entry( std::string block_name, std::vector indices, double value ) +{ + if( _blocks.find( block_name ) == _blocks.end() ) + { + SLHABlock block( block_name ); + _blocks[block_name] = block; + } + _blocks[block_name].set_entry( indices, value ); + /* + cout << "Set block " << block_name << " entry "; + for (int i=0;i < indices.size();i++) + cout << indices[i] << " "; + cout << "to " << _blocks[block_name].get_entry(indices) << endl; + */ +} + +void +SLHAReader::set_block_entry( std::string block_name, int index, double value ) +{ + std::vector indices; + indices.push_back( index ); + set_block_entry( block_name, indices, value ); +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/read_slha.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/read_slha.h new file mode 
100644 index 0000000000..9ec62fd526 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/read_slha.h @@ -0,0 +1,51 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +//========================================================================== + +#ifndef READ_SLHA_H +#define READ_SLHA_H 1 + +#include +#include +#include +#include + +class SLHABlock +{ +public: + SLHABlock( std::string name = "" ) { _name = name; } + ~SLHABlock() {} + void set_entry( std::vector indices, double value ); + double get_entry( std::vector indices, double def_val = 0 ); + void set_name( std::string name ) { _name = name; } + std::string get_name() { return _name; } + unsigned int get_indices() { return _indices; } +private: + std::string _name; + std::map, double> _entries; + unsigned int _indices; +}; + +class SLHAReader +{ +public: + SLHAReader( std::string file_name = "", bool verbose = true ) + { + if( file_name != "" ) read_slha_file( file_name, verbose ); + } + static std::string get_exe_path(); + void read_slha_file( std::string file_name, bool verbose ); + double get_block_entry( std::string block_name, std::vector indices, double def_val = 0 ); + double get_block_entry( std::string block_name, int index, double def_val = 0 ); + void set_block_entry( std::string block_name, std::vector indices, double value ); + void set_block_entry( std::string block_name, int index, double value ); +private: + std::map _blocks; +}; + +#endif diff --git a/PLUGIN/CUDACPP_OUTPUT/model_handling.py 
b/PLUGIN/CUDACPP_OUTPUT/model_handling.py
new file mode 100644
index 0000000000..bc961f6d60
--- /dev/null
+++ b/PLUGIN/CUDACPP_OUTPUT/model_handling.py
@@ -0,0 +1,2554 @@
# Copyright (C) 2020-2025 CERN and UCLouvain.
# Licensed under the GNU Lesser General Public License (version 3 or later).
# Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin.
# Further modified by: O. Mattelaer, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin.

# Module preamble: load plugin-private copies of MG5aMC's export_cpp and
# file_writers modules and monkeypatch them, so that the plugin's code
# generation does not disturb the stock MG5aMC modules (workaround for #341).
# NOTE(review): ordering below is load-bearing — each DEFAULT_* original must
# be captured before the corresponding attribute is reassigned.

import os
import sys

import math
import re

# AV - PLUGIN_NAME can be one of PLUGIN/CUDACPP_OUTPUT or MG5aMC_PLUGIN/CUDACPP_OUTPUT
PLUGIN_NAME = __name__.rsplit('.',1)[0]

# AV - use templates for source code, scripts and Makefiles from PLUGINDIR instead of MG5DIR
###from madgraph import MG5DIR
PLUGINDIR = os.path.dirname( __file__ )

# AV - create a plugin-specific logger
import logging
logger = logging.getLogger('madgraph.%s.model_handling'%PLUGIN_NAME)

#------------------------------------------------------------------------------------

# AV - import the independent 2nd copy of the export_cpp module (as PLUGIN_export_cpp), previously loaded in output.py
###import madgraph.iolibs.export_cpp as export_cpp # 1st copy
######import madgraph.iolibs.export_cpp as PLUGIN_export_cpp # this is not enough to define an independent 2nd copy: id(export_cpp)==id(PLUGIN_export_cpp)
######import PLUGIN.CUDACPP_OUTPUT.PLUGIN_export_cpp as PLUGIN_export_cpp # 2nd copy loaded in the plugin's output.py (but not enough for MG5aMC_PLUGIN case)
__import__('%s.PLUGIN_export_cpp'%PLUGIN_NAME)
PLUGIN_export_cpp = sys.modules['%s.PLUGIN_export_cpp'%PLUGIN_NAME] # 2nd copy loaded in the plugin's output.py (modified for MG5aMC_PLUGIN case)
###print('id(export_cpp)=%s'%id(export_cpp))
###print('id(PLUGIN_export_cpp)=%s'%id(PLUGIN_export_cpp))

#------------------------------------------------------------------------------------

# AV - modify export_cpp.get_mg5_info_lines (replace '# ' by '//')
def PLUGIN_get_mg5_info_lines():
    """Return the MG5 banner lines with '# ' comment markers rewritten as C++ '//'."""
    return DEFAULT_get_mg5_info_lines().replace('# ','//')

# Capture the original before reassigning (PLUGIN_get_mg5_info_lines calls it).
DEFAULT_get_mg5_info_lines = PLUGIN_export_cpp.get_mg5_info_lines
PLUGIN_export_cpp.get_mg5_info_lines = PLUGIN_get_mg5_info_lines

#------------------------------------------------------------------------------------

# AV - load an independent 2nd copy of the writers module (as PLUGIN_writers) and use that within the plugin (workaround for #341)
# See https://stackoverflow.com/a/11285504
###import madgraph.iolibs.file_writers as writers # 1st copy
import importlib.util
SPEC_WRITERS = importlib.util.find_spec('madgraph.iolibs.file_writers')
PLUGIN_writers = importlib.util.module_from_spec(SPEC_WRITERS)
SPEC_WRITERS.loader.exec_module(PLUGIN_writers)
###sys.modules['%s.PLUGIN_writers'%PLUGIN_NAME] = PLUGIN_writers # would allow 'import .PLUGIN_writers' (not needed)
del SPEC_WRITERS

# AV - use the independent 2nd copy of the writers module within the PLUGIN_export_cpp module (workaround for #341)
###DEFAULT_writers = PLUGIN_export_cpp.writers # not needed
PLUGIN_export_cpp.writers = PLUGIN_writers

#------------------------------------------------------------------------------------

# AV - modify writers.FileWriter.__init__ (add a debug printout)
def PLUGIN_FileWriter__init__( self, name, opt = 'w' ):
    """Wrap FileWriter.__init__ with a debug printout of the writer type and file name."""
    print( 'FileWriter %s for %s'%( type(self), name) )
    return DEFAULT_FileWriter__init__( self, name, opt )

# Capture the original __init__ before patching it in (the wrapper delegates to it).
DEFAULT_FileWriter__init__ = PLUGIN_writers.FileWriter.__init__
PLUGIN_writers.FileWriter.__init__ = PLUGIN_FileWriter__init__

#------------------------------------------------------------------------------------

# AV - replace writers.CPPWriter by PLUGIN_CPPWriter (remove formatting)
class PLUGIN_CPPWriter(PLUGIN_writers.FileWriter):
    """Custom CPPWriter based on the default FileWriter with minimal modifications.

    Deliberately empty: inheriting straight from FileWriter drops the default
    CPPWriter's reformatting, so generated C++ is written out verbatim.
    """

DEFAULT_CPPWriter = PLUGIN_writers.CPPWriter
###PLUGIN_writers.CPPWriter = DEFAULT_CPPWriter # WITH
FORMATTING +PLUGIN_writers.CPPWriter = PLUGIN_CPPWriter # WITHOUT FORMATTING + +#------------------------------------------------------------------------------------ + +import aloha +import aloha.aloha_writers as aloha_writers + +from collections import defaultdict +from fractions import Fraction +from six import StringIO + +# AV - define a custom ALOHAWriter +# (NB: enable this via PLUGIN_UFOModelConverter.aloha_writer) +class PLUGIN_ALOHAWriter(aloha_writers.ALOHAWriterForGPU): + # Class structure information + # - object + # - WriteALOHA(object) [in aloha/aloha_writers.py] + # - ALOHAWriterForCPP(WriteALOHA) [in aloha/aloha_writers.py] + # - ALOHAWriterForGPU(ALOHAWriterForCPP) [in aloha/aloha_writers.py] + # - PLUGIN_ALOHAWriter(ALOHAWriterForGPU) + # This class + + # AV - keep defaults from aloha_writers.ALOHAWriterForGPU + ###extension = '.cu' + ###prefix ='__device__' + type2def = aloha_writers.ALOHAWriterForGPU.type2def + + # AV - modify C++ code from aloha_writers.ALOHAWriterForGPU + ###ci_definition = 'cxtype cI = cxtype(0., 1.);\n' + ci_definition = 'const cxtype cI = cxmake( 0., 1. );\n' + ###realoperator = '.real()' + ###imagoperator = '.imag()' + realoperator = 'cxreal' # NB now a function + imagoperator = 'cximag' # NB now a function + + # AV - improve formatting + ###type2def['int'] = 'int ' + type2def['int'] = 'int' + ###type2def['double'] = 'fptype ' + type2def['double'] = 'fptype' + ###type2def['complex'] = 'cxtype ' + type2def['complex'] = 'cxtype' + + # AV - add vector types + type2def['double_v'] = 'fptype_sv' + type2def['complex_v'] = 'cxtype_sv' + + type2def['aloha_ref'] = '&' + + # AV - modify C++ code from aloha_writers.ALOHAWriterForGPU + # AV new option: declare C++ variable type only when they are defined? 
+ ###nodeclare = False # old behaviour (separate declaration with no initialization) + nodeclare = True # new behaviour (delayed declaration with initialisation) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.outname = 'w%s%s' % (self.particles[self.outgoing-1], self.outgoing) + self.momentum_size = 0 # for ALOHAOBJ implementation the momentum is separated from the wavefunctions + + # AV - modify aloha_writers.ALOHAWriterForCPP method (improve formatting) + def change_number_format(self, number): + """Formatting the number""" + def isinteger(x): + try: + return int(x) == x + except TypeError: + return False + if isinteger(number): + if number == 1: out = 'one' # AV + elif number == -1: out = '-one' # AV + elif number == 2: out = 'two' # AV + elif number == -2: out = '-two' # AV + else: out = '%s.' % (str(int(number))) # This prints -1 as '-1.' + elif isinstance(number, complex): + if number.imag: + if number.real: + out = '( %s + %s * cI )' % (self.change_number_format(number.real), \ + self.change_number_format(number.imag)) + else: + if number.imag == 1: + out = 'cI' + elif number.imag == -1: + out = '-cI' + else: + out = '( %s * cI )' % self.change_number_format(number.imag) + else: + out = '%s' % (self.change_number_format(number.real)) + else: + tmp = Fraction(str(number)) + tmp = tmp.limit_denominator(100) + if not abs(tmp - number) / abs(tmp + number) < 1e-8: out = '%.9f' % (number) + elif tmp.numerator == 1 and tmp.denominator == 2 : out = 'half' # AV + elif tmp.numerator == -1 and tmp.denominator == 2 : out = '-half' # AV + elif tmp.numerator == 1 and tmp.denominator == 4 : out = 'quarter' # AV + elif tmp.numerator == -1 and tmp.denominator == 4 : out = '-quarter' # AV + else: out = '%s./%s.' 
% (tmp.numerator, tmp.denominator) + return out + + # AV - modify aloha_writers.ALOHAWriterForCPP method (improve formatting) + # [NB: this exists in ALOHAWriterForGPU but essentially falls back to ALOHAWriterForCPP] + # [NB: no, actually this exists twice(!) in ForGPU and the 2nd version is not trivial! but I keep the ForCPP version] + # This affects HelAmps_sm.h and HelAmps_sm.cc + def get_header_txt(self, name=None, couplings=None,mode=''): + """Define the Header of the fortran file. This include + - function tag + - definition of variable + """ + if name is None: + name = self.name + if mode=='': + mode = self.mode + out = StringIO() + # define the type of function and argument + if not 'no_include' in mode: + out.write('#include \"%s.h\"\n\n' % self.name) + args = [] + comment_inputs = [] # AV + for format, argname in self.define_argument_list(couplings): + if format.startswith('list'): + type = self.type2def[format[5:]] # double or complex (instead of list_double or list_complex) + comment_inputs.append('%s[6]'%argname) # AV (wavefuncsize=6 is hardcoded also in export_cpp...) 
+ ###if not argname.startswith('COUP'): type = self.type2def[format[5:]+'_v'] # AV vectorize (double_v or complex_v) + if not argname.startswith('COUP'): + type = self.type2def['double'] # AV from cxtype_sv to fptype + argname = 'all'+argname + list_arg = '[]' + else: + type = self.type2def[format] + ' ' + self.type2def['aloha_ref'] + list_arg = '' + misc.sprint(argname,self.tag) + if argname.startswith('COUP'): + type = self.type2def['double'] # AV from cxtype_sv to fptype array (running alphas #373) + if 'M' in self.tag: + type = 'FLV_COUPLING_VIEW' + argname = argname.replace('COUP','MCOUP') + list_arg = "" + point = self.type2def['aloha_ref'] + else: + argname = 'all'+argname # AV from cxtype_sv to fptype array (running alphas #373) + list_arg = '[]' # AV from cxtype_sv to fptype array (running alphas #373) + point = self.type2def['pointer_coup'] + args.append('%s %s%s%s'% (type, point, argname, list_arg)) + coeff_n = re.search(r"\d?+$", argname).group() + args.append('double Ccoeff%s'% coeff_n) # OM for 'unary minus' #628 + else: + args.append('%s %s%s'% (type, argname, list_arg)) + if not self.offshell: + ###output = '%(doublec)s%(pointer_vertex)s allvertexes' % { + ### 'doublec': self.type2def['double'], + ### 'pointer_vertex': self.type2def['pointer_vertex']} + output = '%(doublec)s allvertexes[]' % { + 'doublec': self.type2def['double']} + comment_output = 'amplitude \'vertex\'' + template = 'template' + else: + alohatype = 'aloha%s' % self.particles[self.outgoing -1] + output = '%(doublec)s %(aloha_ref)s %(spin)s%(id)d' % { + 'doublec': self.type2def[alohatype], + 'spin': self.particles[self.outgoing -1], + 'aloha_ref': self.type2def['aloha_ref'], + 'id': self.outgoing} + ###self.declaration.add(('list_complex', output)) # AV BUG FIX - THIS IS NOT NEEDED AND IS WRONG (adds name 'cxtype_sv V3[]') + comment_output = 'wavefunction \'%s%d[6]\'' % ( self.particles[self.outgoing -1], self.outgoing ) # AV (wavefuncsize=6) + template = 'template' + comment = '// 
Compute the output %s from the input wavefunctions %s' % ( comment_output, ', '.join(comment_inputs) ) # AV + indent = ' ' * len( ' %s( ' % name ) + out.write(' %(comment)s\n %(template)s\n %(prefix)s void\n %(name)s( const %(args)s,\n%(indent)s%(output)s )%(suffix)s' % + {'comment': comment, # AV - add comment + 'template': template, # AV - add template + 'prefix': self.prefix + ( ' INLINE' if 'is_h' in mode else '' ), # AV - add INLINE + 'suffix': ( ' ALWAYS_INLINE' if 'is_h' in mode else '' ), # AV - add ALWAYS_INLINE + 'indent':indent, 'output':output, 'name': name, + 'args': (',\n' + indent + 'const ').join(args)}) # AV - add const, add indent + if 'is_h' in mode: + out.write(';\n') + out.write('\n //--------------------------------------------------------------------------\n') # AV add footer + else: + ###out.write('\n{\n') + out.write('\n {\n') # AV + return out.getvalue() + + # AV - modify aloha_writers.ALOHAWriterForCPP method (improve formatting) + # This affects HelAmps_sm.cc + def get_foot_txt(self): + """Prototype for language specific footer""" + ###return '}\n' + return ' }\n\n //--------------------------------------------------------------------------' # AV + + # AV - modify aloha_writers.ALOHAWriterForCPP method (improve formatting) + # This affects HelAmps_sm.cc + def get_declaration_txt(self, add_i=True): + """ Prototype for how to write the declaration of variable + Include the symmetry line (entry FFV_2) + """ + out = StringIO() + out.write(' mgDebug( 0, __FUNCTION__ );\n') # AV + ###argument_var = [name for type,name in self.call_arg] # UNUSED + for type, name in self.call_arg: + ###out.write(' %s %s;\n' % ( type, name ) ) # FOR DEBUGGING + if type.startswith('aloha'): + out.write(' const cxtype_sv* w%s = W_ACCESS::kernelAccessConst( %s.w );\n' % ( name, name ) ) + if name.startswith('COUP'): # AV from cxtype_sv to fptype array (running alphas #373) + if 'M' in self.tag: + out.write(' cxtype_sv %s;\n' % name ) + else: + out.write(' const 
cxtype_sv %s = C_ACCESS::kernelAccessConst( all%s );\n' % ( name, name ) ) + if not self.offshell: + vname = 'vertex' + access = 'A_ACCESS' + allvname = 'allvertexes' + else: + vname = '%(spin)s%(id)d' % { 'spin': self.particles[self.outgoing -1], 'id': self.outgoing } + access = 'W_ACCESS' + allvname = vname+".w" + vname = "w" + vname + out.write(' cxtype_sv* %s = %s::kernelAccess( %s );\n' % ( vname, access, allvname ) ) + # define the complex number CI = 0+1j + if add_i: + ###out.write(self.ci_definition) + out.write(' ' + self.ci_definition) # AV + codedict = {} # AV allow delayed declaration with initialisation + for type, name in self.declaration.tolist(): + ###print(name) # FOR DEBUGGING + ###out.write(' %s %s;\n' % ( type, name ) ) # FOR DEBUGGING + if type.startswith('list'): + type = type[5:] + if name.startswith('P'): + size = 4 + elif not 'tmp' in name: + continue # should be defined in the header + elif name[0] in ['F','V']: + if aloha.loop_mode: + size = 8 + else: + size = 6 + elif name[0] == 'S': + if aloha.loop_mode: + size = 5 + else: + size = 3 + elif name[0] in ['R','T']: + if aloha.loop_mode: + size = 20 + else: + size = 18 + fullname = '%s[%s]'%(name, size) # AV + elif (type, name) not in self.call_arg: + fullname = name # AV + else: + continue # AV no need to declare the variable + if fullname.startswith('OM') : + codedict[fullname] = '%s %s' % (self.type2def[type], fullname) # AV UGLY HACK (OM3 is always a scalar) + else: + codedict[fullname] = '%s %s' % (self.type2def[type+'_v'], fullname) # AV vectorize, add to codedict + ###print(fullname, codedict[fullname]) # FOR DEBUGGING + if self.nodeclare: + self.declaration.codedict = codedict # AV new behaviour (delayed declaration with initialisation) + else: + out.write(' %s;\n' % codedict[fullname] ) # AV old behaviour (separate declaration with no initialization) + ###out.write(' // END DECLARATION\n') # FOR DEBUGGING + return out.getvalue() + + # AV - modify aloha_writers.ALOHAWriterForCPP 
method (improve formatting) + # This affects 'V1[0] = ' in HelAmps_sm.cc + def get_momenta_txt(self): + """Define the Header of the C++ file. This include + - momentum conservation + - definition of the impulsion""" + out = StringIO() + # Define all the required momenta + p = [] # a list for keeping track how to write the momentum + signs = self.get_momentum_conservation_sign() + for i, type in enumerate(self.particles): + if self.declaration.is_used( 'OM%s' % (i+1) ): + declname = 'OM%s' % (i+1) + if self.nodeclare: declname = 'const ' + self.declaration.codedict[declname] + out.write(' {3} = ( M{0} != {1} ? {2} / ( M{0} * M{0} ) : {1} );\n'.format( # AV use ternary in OM3 + i+1, '0.', '1.', declname)) # AV force scalar "1." instead of vector "one", add declaration + if i+1 == self.outgoing: + out_type = type + out_size = self.type_to_size[type] + continue + elif self.offshell: + if len(p) == 0 : + p.append('{0}{1}{2}.pvec[%(i)s]'.format(signs[i],type,i+1,type)) # AV for clang-format (ugly!) 
+ else: + p.append(' ') + p.append('{0} {1}{2}.pvec[%(i)s]'.format(signs[i],type,i+1,type)) + if self.declaration.is_used('P%s' % (i+1)): + self.get_one_momenta_def(i+1, out) + # Define the resulting momenta + if self.offshell: + energy_pos = out_size -2 + type = self.particles[self.outgoing-1] + if aloha.loop_mode: + size_p = 4 + else: + size_p = 4 + for i in range(size_p): + dict_energy = {'i':i} + out.write( ' %s%s.pvec[%s] = %s;\n' % ( type, self.outgoing, i, ''.join(p) % dict_energy ) ) + if self.declaration.is_used( 'P%s' % self.outgoing ): + self.get_one_momenta_def( self.outgoing, out ) + # Returning result + ###print('."' + out.getvalue() + '"') # AV - FOR DEBUGGING + return out.getvalue() + + # AV - modify aloha_writers.ALOHAWriterForCPP method (improve formatting, add delayed declaration with initialisation) + # This affects 'P1[0] = ' in HelAmps_sm.cc + def get_one_momenta_def(self, i, strfile): + type = self.particles[i-1] + if aloha.loop_mode: + ptype = 'complex_v' + else: + ptype = 'double_v' + templateval ='%(sign)s%(type)s%(i)d.pvec[%(j)d]' + if self.nodeclare: strfile.write(' const %s P%d[4] = { ' % ( self.type2def[ptype], i) ) # AV + for j in range(4): + sign = self.get_P_sign(i) if self.get_P_sign(i) else '+' # AV + if self.nodeclare: template = templateval + ( ', ' if j<3 else '' ) # AV + else: template =' P%(i)d[%(j)d] = ' + templateval + ';\n' # AV + strfile.write(template % {'j':j,'type': type, 'i': i, 'sign': sign}) # AV + if self.nodeclare: strfile.write(' };\n') # AV + + def get_coupling_def(self): + """Define the coupling constant""" + # This is the same as the parent class method, but adapted for CUDACPP types + + nb_coupling = 0 + for ftype, name in self.declaration: + if name.startswith('COUP'): + nb_coupling += 1 + + out = StringIO() + if 'M' not in self.tag: + if self.particles[0] != 'F': + return '' + # no matrix coupling, so a single coupling, so this is diagonal in flavor space + # but still need to check ! 
+ elif self.outgoing == 0 or self.particles[self.outgoing-1] not in ['F']: + if not self.outgoing: + fail = "*vertex = cxzero_sv();" + else: + fail = 'for(int i=0; i<%s%d.np4; i++) { w%s%d[i] = cxzero_sv(); }' % (self.particles[self.outgoing-1], self.outgoing, self.particles[self.outgoing-1], self.outgoing) + + out.write(' const int & flv_index1 = F1.flv_index;\n') + out.write(' const int & flv_index2 = F2.flv_index;\n') + out.write(' if(flv_index1 != flv_index2 || flv_index1 == -1) {\n') + out.write(' %s\n' % fail) + out.write(' return;\n') + out.write(' }\n') + else: + incoming = [i+1 for i in range(len(self.particles)) if i+1 != self.outgoing and self.particles[self.outgoing-1] == 'F'][0] + outgoing = self.outgoing + out.write(' F%i.flv_index = F%i.flv_index;\n' % (outgoing, incoming)) + + return out.getvalue() + + if self.outgoing == 0 or self.particles[self.outgoing-1] not in ['F']: + if not self.outgoing: + fail = "*vertex = cxzero_sv();" + else: + fail = 'for(int i=0; i<%s%d.np4; i++) { w%s%d[i] = cxzero_sv(); }' % (self.particles[self.outgoing-1], self.outgoing, self.particles[self.outgoing-1], self.outgoing) + + out.write(' const int & flv_index1 = F1.flv_index;\n') + out.write(' const int & flv_index2 = F2.flv_index;\n') + if nb_coupling >1: + for i in range(1,nb_coupling+1): + out.write(' int zero_coup%i = 0;\n' % i) + out.write(' if(flv_index1 != flv_index2 || flv_index1 == -1) {\n') + out.write(' %s\n' % fail) + out.write(' return;\n') + out.write(' }\n') + out.write(' if(flv_index1 == -1 || flv_index2 == -1) {\n') + out.write(' %s\n' % fail) + out.write(' return;\n') + out.write(' }\n') + if nb_coupling == 1: + out.write(' if(MCOUP.partner1[flv_index1] != flv_index2) {\n') + out.write(' %s\n' % fail) + out.write(' return;\n') + out.write(' }\n') + else: + for i in range(1,nb_coupling+1): + out.write(' if(MCOUP%i.partner1[flv_index1] != flv_index2 || MCOUP%i.partner2[flv_index1] != flv_index2) {\n' %(i,i)) + out.write(' zero_coup%i = 1;\n' % i) + 
out.write(' COUP%i = cxzero_sv();\n' % i) + out.write(' }\n') + if nb_coupling ==1: + # the coupling is a complex number but in this case it is represented as a sequence of real numbers + # so, when we need to shift within the array, we need to double the shift width to account for + # both real and imaginary parts + out.write(' COUP = C_ACCESS::kernelAccessConst( MCOUP.value + 2*flv_index1 );\n') + else: + for i in range(1,nb_coupling+1): + # the coupling is a complex number but in this case it is represented as a sequence of real numbers + # so, when we need to shift within the array, we need to double the shift width to account for + # both real and imaginary parts + out.write(' if(zero_coup%i ==0) { COUP%i = C_ACCESS::kernelAccessConst( MCOUP%i.value + 2*flv_index1 ); }\n' % (i,i,i)) + else: + incoming = [i+1 for i in range(len(self.particles)) if i+1 != self.outgoing and self.particles[self.outgoing-1] == 'F'][0] + if incoming %2 == 1: + outgoing = self.outgoing + out.write(' int flv_index%i = F%i.flv_index;\n' % (incoming, incoming)) + out.write(' if(flv_index%i == -1) {\n' %(incoming)) + out.write(' for(int i=0; i"' + nb_str + '"') # AV - FOR DEBUGGING + if nb_str[0] in ['+', '-']: + if first: file_str.write(nb_str) + else : file_str.write(' ' + nb_str[0] + ' ' + nb_str[1:]) + elif first and nb_str == '( half * cI )': + file_str.write('half * cI') + elif not first and nb_str == '( -half * cI )': + file_str.write(' - half * cI') + else: + file_str.write('+' if first else ' + ') + file_str.write(nb_str) + file_str.write(' * ') + if len( obj_list ) > 1 or not self.objIsSimpleVariable( obj_list[0] ) : file_str.write('( ') + elif value == -1: + add = ' - ' + file_str.write('-' if first else ' - ') + elif not first: + file_str.write(' + ') + else: + file_str.write('') + first = False + # AV comment: write_obj here also adds calls declaration_add (via change_var_format) - example: OM3 + ###print('..."'+file_str.getvalue()+'"') # AV - FOR DEBUGGING + file_str.write( 
add.join( [self.write_obj(obj, prefactor=False) for obj in obj_list] ) ) # NB: RECURSIVE! (write_obj_Add calls write_obj...) + ###print('...."'+file_str.getvalue()+'"') # AV - FOR DEBUGGING + if value not in [1,-1]: + if len( obj_list ) > 1 or not self.objIsSimpleVariable( obj_list[0] ) : file_str.write(' )') + if number: + total = sum(number) + file_str.write('+ %s' % self.change_number_format(total)) + file_str.write(' )') + ###print('....."'+file_str.getvalue()+'"') # AV - FOR DEBUGGING + return file_str.getvalue() + +#------------------------------------------------------------------------------------ + +from os.path import join as pjoin + +# AV - define a custom UFOModelConverter +# (NB: enable this via PLUGIN_ProcessExporter.create_model_class in output.py) +class PLUGIN_UFOModelConverter(PLUGIN_export_cpp.UFOModelConverterGPU): + # Class structure information + # - object + # - UFOModelConverterCPP(object) [in madgraph/iolibs/export_cpp.py] + # - UFOModelConverterGPU(UFOModelConverterCPP) [in madgraph/iolibs/export_cpp.py] + # - PLUGIN_UFOModelConverter(UFOModelConverterGPU) + # This class + + # AV - keep defaults from export_cpp.UFOModelConverterCPP + ###include_dir = '.' + ###c_file_dir = '.' 
+ ###param_template_h = 'cpp_model_parameters_h.inc' + ###param_template_cc = 'cpp_model_parameters_cc.inc' + + # AV - change defaults from export_cpp.UFOModelConverterCPP + # (custom tag to appear in 'This file has been automatically generated for') + output_name = 'CUDA/C++ standalone' + + # AV - change defaults from export_cpp.UFOModelConverterGPU + ###cc_ext = 'cu' # create HelAmps_sm.cu + cc_ext = 'cc' # create HelAmps_sm.cc + + # AV - keep defaults from export_cpp.UFOModelConverterGPU + ###cc_ext = 'cu' + ###aloha_template_h = pjoin('gpu','cpp_hel_amps_h.inc') + ###aloha_template_cc = pjoin('gpu','cpp_hel_amps_cc.inc') + ###helas_h = pjoin('gpu', 'helas.h') + ###helas_cc = pjoin('gpu', 'helas.cu') + + # AV - use a custom ALOHAWriter (NB: this is an argument to WriterFactory.__new__, either a string or a class!) + ###aloha_writer = 'cudac' # WriterFactory will use ALOHAWriterForGPU + aloha_writer = PLUGIN_ALOHAWriter # WriterFactory will use ALOHAWriterForGPU + + # AV - use template files from PLUGINDIR instead of MG5DIR; strip leading copyright lines + def read_aloha_template_files(self, ext): + """Read all ALOHA template files with extension ext, strip them of + compiler options and namespace options, and return in a list""" + ###path = pjoin(MG5DIR, 'aloha','template_files') + path = pjoin(PLUGINDIR, 'aloha', 'template_files') + out = [] + if ext == 'h': file = open(pjoin(path, self.helas_h)).read() + else: file = open(pjoin(path, self.helas_cc)).read() + file = '\n'.join( file.split('\n')[9:] ) # skip first 9 lines in helas.h/cu (copyright including ALOHA) + out.append( file ) + return out + + # AV - use the plugin's PLUGIN_OneProcessExporter template_path and __template_path (for aloha_template_h/cc) + @classmethod + def read_template_file(cls, filename, classpath=False): + """Open a template file and return the contents.""" + ###return OneProcessExporterCPP.read_template_file(filename, classpath) + return 
PLUGIN_OneProcessExporter.read_template_file(filename, classpath) + + # AV - overload export_cpp.UFOModelConverterCPP method (improve formatting) + def write_parameters(self, params): + res = super().write_parameters(params) + res = res.replace('std::complex<','cxsmpl<') # custom simplex complex class (with constexpr arithmetics) + res = res.replace('\n','\n ') + res = res.replace(',',', ') + if res == '' : res = ' // (none)' + else : res = ' ' + res # add leading ' ' after the '// Model' line + return res + + def write_flv_couplings(self, params): + """Write out the lines of independent parameters""" + + def_flv = [] + # For each parameter, write name = expr; + for coupl in params: + for key, c in coupl.flavors.items(): + # get first/second index + k1, k2 = [i for i in key if i!=0] + def_flv.append('%(name)s.partner1[%(in)i] = %(out)i;' % {'name': coupl.name,'in': k1-1, 'out': k2-1}) + def_flv.append('%(name)s.partner2[%(out)i] = %(in)i;' % {'name': coupl.name,'in': k1-1, 'out': k2-1}) + def_flv.append('%(name)s.value[%(in)i] = &%(coupl)s;' % {'name': coupl.name,'in': k1-1, 'coupl': c}) + + return "\n ".join(def_flv) + + # AV - overload export_cpp.UFOModelConverterCPP method (improve formatting) + def write_set_parameters(self, params): + res = self.super_write_set_parameters_donotfixMajorana(params) + res = res.replace('(','( ') + res = res.replace(')',' )') + res = res.replace('+',' + ') + res = res.replace('-',' - ') + res = res.replace('e + ','e+') # fix exponents + res = res.replace('e - ','e-') # fix exponents + res = res.replace('= + ','= +') # fix leading + in assignmments + res = res.replace('= - ','= -') # fix leading - in assignmments + res = res.replace('*',' * ') + res = res.replace('/',' / ') + res = res.replace(',',', ') + res = res.replace(', ',', ') + res = res.replace('std::complex<','cxsmpl<') # custom simplex complex class (with constexpr arithmetics) + if res == '' : res = '// (none)' + res = res.replace('\n','\n ') + res = res.replace('( - 
','( -') # post-fix for susy + res = res.replace(', - ',', -') # post-fix for susy + res = res.replace('Re+mdl','Re + mdl') # post-fix for smeft + res = res.replace('Re+0','Re + 0') # post-fix for smeft + res = res.replace('He-2','He - 2') # post-fix for smeft + res = res.replace(', - ',', -') # post-fix for smeft + ###misc.sprint( "'"+res+"'" ) + return res + + # AV - new method (merging write_parameters and write_set_parameters) + def write_hardcoded_parameters(self, params, deviceparams=dict()): + majorana_widths = [] + for particle in self.model.get('particles'): + if particle.is_fermion() and particle.get('self_antipart') and \ + particle.get('width').lower() != 'zero': + majorana_widths.append( particle.get('width') ) + ###misc.sprint(params) # for debugging + pardef = super().write_parameters(params) + parset = self.super_write_set_parameters_donotfixMajorana(params) + ###print( '"' + pardef + '"' ) + ###print( '"' + parset + '"' ) + if ( pardef == '' ): + assert parset == '', "pardef is empty but parset is not: '%s'"%parset # AV sanity check (both are empty) + res = '// (none)\n' + return res + #=== Replace patterns in pardef (left of the assignment '=') + pardef = pardef.replace('std::complex<','cxsmpl<') # custom simplex complex class (with constexpr arithmetics) + #=== Replace patterns in parset (right of the assignment '=') + parset = parset.replace('std::complex<','cxsmpl<') # custom simplex complex class (with constexpr arithmetics) + parset = parset.replace('sqrt(','constexpr_sqrt(') # constexpr sqrt (based on iterative Newton-Raphson approximation) + parset = parset.replace('pow(','constexpr_pow(') # constexpr pow + parset = parset.replace('atan(','constexpr_atan(') # constexpr atan for BSM #627 + parset = parset.replace('sin(','constexpr_sin(') # constexpr sin for BSM #627 + parset = parset.replace('cos(','constexpr_cos(') # constexpr cos for BSM #627 + parset = parset.replace('tan(','constexpr_tan(').replace('aconstexpr_tan(','atan(') # constexpr 
tan for BSM #627 + parset = parset.replace('(','( ') + parset = parset.replace(')',' )') + parset = parset.replace('+',' + ') + parset = parset.replace('-',' - ') + parset = parset.replace('e + ','e+') # fix exponents + parset = parset.replace('e - ','e-') # fix exponents + parset = parset.replace('= + ','= +') # fix leading + in assignments + parset = parset.replace('= - ','= -') # fix leading - in assignments + parset = parset.replace('*',' * ') + parset = parset.replace('/',' / ') + parset = parset.replace(',',', ') + #=== Compute pardef_lines from pardef (left of the assignment '=') + pardef_lines = {} + for line in pardef.split('\n'): + ###print(line) # for debugging + type, pars = line.rstrip(';').split(' ') # strip trailing ';' + for par in pars.split(','): + ###print(len(pardef_lines), par) # for debugging + if par in majorana_widths: + pardef_lines[par] = ( 'constexpr ' + type + ' ' + par + "_abs" ) + elif par in deviceparams: + pardef_lines[par] = ( '__device__ constexpr ' + type + ' ' + par ) + else: + pardef_lines[par] = ( 'constexpr ' + type + ' ' + par ) + ###misc.sprint( 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) ) + ###print( pardef_lines ) # for debugging + ###for line in pardef_lines: misc.sprint(line) # for debugging + #=== Compute parset_lines from parset (right of the assignment '=') + parset_pars = [] + parset_lines = {} + skipnextline = False + for iline, line in enumerate(parset.split('\n')): + ###print(iline, line) # for debugging + if line.startswith('indices'): + ###print('WARNING! 
Skip line with leading "indices" :', line) + continue # skip line with leading "indices", before slha.get_block_entry (#622) + par, parval = line.split(' = ') + ###misc.sprint(len(parset_pars), len(parset_lines), par, parval) # for debugging + if parval.startswith('slha.get_block_entry'): parval = parval.split(',')[2].lstrip(' ').rstrip(');') + ';' + parset_pars.append( par ) + parset_lines[par] = parval # includes a trailing ';' + ###misc.sprint( 'parset_pars size =', len(parset_pars) ) + ###misc.sprint( 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) ) + ###print( parset_lines ) # for debugging + ###for line in parset_lines: misc.sprint(line) # for debugging + #=== Assemble pardef_lines and parset_lines into a single string res and then replace patterns in res + assert len(pardef_lines) == len(parset_lines), 'len(pardef_lines) != len(parset_lines)' # AV sanity check (same number of parameters) + res = ' '.join( pardef_lines[par] + ' = ' + parset_lines[par] + '\n' for par in parset_pars ) # no leading ' ' on first row + res = res.replace(' ;',';') + res = res.replace('= - ','= -') # post-fix for susy + res = res.replace('( - ','( -') # post-fix for susy + res = res.replace('Re+mdl','Re + mdl') # better post-fix for smeft #633 + res = res.replace('Re+0','Re + 0') # better post-fix for smeft #633 + res = res.replace('He-2','He - 2') # better post-fix for smeft #633 + res = res.replace(', - ',', -') # post-fix for smeft + ###print(res); assert(False) + ###misc.sprint( "'"+res+"'" ) + return res + + # AV - replace export_cpp.UFOModelConverterCPP method (split writing of parameters and fixes for Majorana particles #622) + def super_write_set_parameters_donotfixMajorana(self, params): + """Write out the lines of independent parameters""" + res_strings = [] + # For each parameter, write "name = expr;" + for param in params: + res_strings.append( "%s" % param.expr ) + res = "\n".join(res_strings) + res = res.replace('ABS(','std::abs(') 
# for SMEFT #614 and #616 + return res + + # AV - replace export_cpp.UFOModelConverterCPP method (eventually split writing of parameters and fixes for Majorana particles #622) + def super_write_set_parameters_onlyfixMajorana(self, hardcoded): # FIXME! split hardcoded (constexpr) and not-hardcoded code + """Write out the lines of independent parameters""" + print( 'super_write_set_parameters_onlyfixMajorana (hardcoded=%s)'%hardcoded ) + res_strings = [] + # Correct width sign for Majorana particles (where the width and mass need to have the same sign) + prefix = " " if hardcoded else "" # hardcoded code goes into Parameters.h and needs two extra leading spaces due to a namespace... + for particle in self.model.get('particles'): + if particle.is_fermion() and particle.get('self_antipart') and \ + particle.get('width').lower() != 'zero': + if hardcoded: + res_strings.append( prefix+" constexpr int %s_sign = ( %s < 0 ? -1 : +1 );" % ( particle.get('width'), particle.get('mass') ) ) + res_strings.append( prefix+" constexpr double %(W)s = %(W)s_sign * %(W)s_abs;" % { 'W' : particle.get('width') } ) + else: + res_strings.append( prefix+" if( %s < 0 )" % particle.get('mass')) + res_strings.append( prefix+" %(width)s = -std::abs( %(width)s );" % {"width": particle.get('width')}) + if len( res_strings ) != 0 : res_strings = [ prefix + " // Fixes for Majorana particles" ] + res_strings + if not hardcoded: return '\n' + '\n'.join(res_strings) if res_strings else '' + else: return '\n' + '\n'.join(res_strings) + '\n' if res_strings else '\n' + + # AV - replace export_cpp.UFOModelConverterCPP method (add hardcoded parameters and couplings) + def super_generate_parameters_class_files(self): + """Create the content of the Parameters_model.h and .cc files""" + # First of all, identify which extra independent parameters must be made available through CPU static and GPU constant memory in BSM models + # because they are used in the event by event calculation of alphaS-dependent 
couplings + bsmparam_indep_real_used = [] + bsmparam_indep_complex_used = [] + for param in self.params_indep: # NB this is now done also for 'sm' processes (no check on model name, see PR #824) + if param.name == 'mdl_complexi' : continue + if param.name == 'aS' : continue + # Add BSM parameters which are needed to compute dependent couplings + # Note: this seemed enough to fix SUSY processes, but not EFT processes + for coupdep in self.coups_dep.values(): + if param.name in coupdep.expr: + ###if ' '+param.name+' ' in coupdep.expr: # this is not enough, see review of PR #824 and mg5amcnlo#90 + if param.type == 'real': + bsmparam_indep_real_used.append( param.name ) + elif param.type == 'complex': + bsmparam_indep_complex_used.append( param.name ) + # Add BSM parameters which are needed to compute dependent parameters + # Note: this was later added to also fix EFT processes (related to #616) + for pardep in self.params_dep: + if param.name in pardep.expr: + ###if param.name in pardep.expr: # this is not enough, see review of PR #824 and mg5amcnlo#90 + if param.type == 'real': + bsmparam_indep_real_used.append( param.name ) + elif param.type == 'complex': + bsmparam_indep_complex_used.append( param.name ) + # Use dict.fromkeys() instead of set() to ensure a reproducible ordering of parameters (see https://stackoverflow.com/a/53657523) + bsmparam_indep_real_used = dict.fromkeys( bsmparam_indep_real_used ) + bsmparam_indep_complex_used = dict.fromkeys( bsmparam_indep_complex_used ) + # Then do everything else + replace_dict = self.default_replace_dict + replace_dict['info_lines'] = PLUGIN_export_cpp.get_mg5_info_lines() + params_indep = [ line.replace('aS, ','') + for line in self.write_parameters(self.params_indep).split('\n') ] + replace_dict['independent_parameters'] = '// Model parameters independent of aS\n //double aS; // now retrieved event-by-event (as G) from Fortran (running alphas #373)\n' + '\n'.join( params_indep ) + replace_dict['independent_couplings'] 
= '// Model couplings independent of aS\n' + self.write_parameters(self.coups_indep) + params_dep = [ ' //' + line[4:] + ' // now computed event-by-event (running alphas #373)' for line in self.write_parameters(self.params_dep).split('\n') ] + replace_dict['dependent_parameters'] = '// Model parameters dependent on aS\n' + '\n'.join( params_dep ) + coups_dep = [ ' //' + line[4:] + ' // now computed event-by-event (running alphas #373)' for line in self.write_parameters(list(self.coups_dep.values())).split('\n') ] + replace_dict['dependent_couplings'] = '// Model couplings dependent on aS\n' + '\n'.join( coups_dep ) + replace_dict['flavor_independent_couplings'] = \ + "// Model flavor couplings independent of aS\n" + \ + self.write_parameters([c for c in self.coups_flv_indep]) + replace_dict['flavor_dependent_couplings'] = \ + "// Model flavor couplings dependent of aS\n" + \ + self.write_parameters([c for c in self.coups_flv_dep]) + set_params_indep = [ line.replace('aS','//aS') + ' // now retrieved event-by-event (as G) from Fortran (running alphas #373)' + if line.startswith( ' aS =' ) else + line for line in self.write_set_parameters(self.params_indep).split('\n') ] + replace_dict['set_independent_parameters'] = '\n'.join( set_params_indep ) + replace_dict['set_independent_parameters'] += self.super_write_set_parameters_onlyfixMajorana( hardcoded=False ) # add fixes for Majorana particles only in the aS-indep parameters #622 + replace_dict['set_independent_parameters'] += '\n // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings;' # NB this is now done also for 'sm' processes (no check on model name, see PR #824) + replace_dict['set_flv_couplings'] = self.write_flv_couplings(self.coups_flv_dep+self.coups_flv_indep) + if len(bsmparam_indep_real_used) + len(bsmparam_indep_complex_used) > 0: + for ipar, par in enumerate( bsmparam_indep_real_used ): + replace_dict['set_independent_parameters'] += '\n 
mdl_bsmIndepParam[%i] = %s;' % ( ipar, par ) + for ipar, par in enumerate( bsmparam_indep_complex_used ): + replace_dict['set_independent_parameters'] += '\n mdl_bsmIndepParam[%i] = %s.real();' % ( len(bsmparam_indep_real_used) + 2 * ipar, par ) + replace_dict['set_independent_parameters'] += '\n mdl_bsmIndepParam[%i] = %s.imag();' % ( len(bsmparam_indep_real_used) + 2 * ipar + 1, par ) + else: + replace_dict['set_independent_parameters'] += '\n // (none)' + replace_dict['set_independent_couplings'] = self.write_set_parameters(self.coups_indep) + replace_dict['set_dependent_parameters'] = self.write_set_parameters(self.params_dep) + replace_dict['set_dependent_couplings'] = self.write_set_parameters(list(self.coups_dep.values())) + print_params_indep = [ line.replace('std::cout','//std::cout') + ' // now retrieved event-by-event (as G) from Fortran (running alphas #373)' + if '"aS =' in line else + line for line in self.write_print_parameters(self.params_indep).split('\n') ] + replace_dict['print_independent_parameters'] = '\n'.join( print_params_indep ) + replace_dict['print_independent_couplings'] = self.write_print_parameters(self.coups_indep) + replace_dict['print_dependent_parameters'] = self.write_print_parameters(self.params_dep) + replace_dict['print_dependent_couplings'] = self.write_print_parameters(list(self.coups_dep.values())) + if 'include_prefix' not in replace_dict: + replace_dict['include_prefix'] = '' + assert super().write_parameters([]) == '', 'super().write_parameters([]) is not empty' # AV sanity check (#622) + assert self.super_write_set_parameters_donotfixMajorana([]) == '', 'super_write_set_parameters_donotfixMajorana([]) is not empty' # AV sanity check (#622) + ###misc.sprint(self.params_indep) # for debugging + hrd_params_indep = [ line.replace('constexpr','//constexpr') + ' // now retrieved event-by-event (as G) from Fortran (running alphas #373)' if 'aS =' in line else line for line in self.write_hardcoded_parameters(self.params_indep, 
{**bsmparam_indep_real_used, **bsmparam_indep_complex_used}).split('\n') if line != '' ] # use bsmparam_indep_real_used + bsmparam_indep_complex_used as deviceparams (dictionary merge as in https://stackoverflow.com/a/26853961) + replace_dict['hardcoded_independent_parameters'] = '\n'.join( hrd_params_indep ) + self.super_write_set_parameters_onlyfixMajorana( hardcoded=True ) # add fixes for Majorana particles only in the aS-indep parameters #622 + ###misc.sprint(self.coups_indep) # for debugging + replace_dict['hardcoded_independent_couplings'] = self.write_hardcoded_parameters(self.coups_indep) + ###misc.sprint(self.params_dep) # for debugging + hrd_params_dep = [ line.replace('constexpr ','//constexpr ') + ' // now computed event-by-event (running alphas #373)' if line != '' else line for line in self.write_hardcoded_parameters(self.params_dep).split('\n') ] + replace_dict['hardcoded_dependent_parameters'] = '\n'.join( hrd_params_dep ) + ###misc.sprint(self.coups_dep) # for debugging + hrd_coups_dep = [ line.replace('constexpr','//constexpr') + ' // now computed event-by-event (running alphas #373)' if line != '' else line for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') ] + replace_dict['hardcoded_dependent_couplings'] = '\n'.join( hrd_coups_dep ) + replace_dict['nicoup'] = len( self.coups_indep ) + if len( self.coups_indep ) > 0 : + iicoup = [ ' //constexpr size_t ixcoup_%s = %d + Parameters_dependentCouplings::ndcoup; // out of ndcoup+nicoup' % (par.name, id) for (id, par) in enumerate(self.coups_indep) ] + replace_dict['iicoup'] = '\n'.join( iicoup ) + else: + replace_dict['iicoup'] = ' // NB: there are no aS-independent couplings in this physics process' + replace_dict['ndcoup'] = len( self.coups_dep ) + if len( self.coups_dep ) > 0 : + idcoup = [ ' constexpr size_t idcoup_%s = %d;' % (name, id) for (id, name) in enumerate(self.coups_dep) ] + replace_dict['idcoup'] = '\n'.join( idcoup ) + dcoupdecl = [ ' cxtype_sv %s;' 
% name for name in self.coups_dep ] + replace_dict['dcoupdecl'] = '\n'.join( dcoupdecl ) + dcoupsetdpar = [] + # Special handling of G and aS parameters (cudacpp starts from G, while UFO starts from aS) + # For simplicity, compute these parameters directly from G, rather than from another such parameter + # (e.g. do not compute mdl_sqrt__aS as sqrt of aS, which would require defining aS first) + gparameters = { 'aS' : 'G * G / 4. / M_PI', + 'mdl_sqrt__aS' : 'G / 2. / constexpr_sqrt( M_PI )' } + gparamcoded = set() + foundG = False + for pdep in self.params_dep: + ###misc.sprint(pdep.type, pdep.name) + line = ' ' + self.write_hardcoded_parameters([pdep]).rstrip('\n') + ###misc.sprint(line) + if not foundG: + # Comment out the default UFO assignment of mdl_sqrt__aS (from aS) and of G (from mdl_sqrt__aS), but keep them for reference + # (WARNING! This Python CODEGEN code essentially assumes that this refers precisely and only to mdl_sqrt__aS and G) + dcoupsetdpar.append( ' ' + line.replace('constexpr double', '//const fptype_sv') ) + ###elif pdep.name == 'mdl_G__exp__2' : # added for UFO mg5amcnlo#89 (complex in susy, should be double as in heft/smeft), now fixed + ### # Hardcode a custom assignment (valid for both SUSY and SMEFT) instead of replacing double or complex by fptype in 'line' + ### dcoupsetdpar.append(' const fptype_sv ' + pdep.name + ' = G * G;' ) + ###elif pdep.name == 'mdl_G__exp__3' : # added for UFO mg5amcnlo#89 (complex in smeft, should be double), now fixed, may be removed + ### # Hardcode a custom assignment (valid for both SUSY and SMEFT) instead of replacing double or complex by fptype in 'line' + ### dcoupsetdpar.append(' const fptype_sv ' + pdep.name + ' = G * G * G;' ) + elif pdep.name in gparameters: + # Skip the default UFO assignment from aS (if any?!) of aS and mdl_sqrt__aS, as these are now derived from G + # (WARNING! no path to this statement! 
aS is not in params_dep, while mdl_sqrt__aS is handled in 'if not foundG' above) + ###misc.sprint('Skip gparameter:', pdep.name) + continue + else: + for gpar in gparameters: + if ' ' + gpar + ' ' in line and not gpar in gparamcoded: + gparamcoded.add(gpar) + dcoupsetdpar.append(' const fptype_sv ' + gpar + ' = ' + gparameters[gpar] + ';' ) + dcoupsetdpar.append( ' ' + line.replace('constexpr double', 'const fptype_sv') ) + if pdep.name == 'G': + foundG = True + dcoupsetdpar.append(' // *** NB Compute all dependent parameters, including aS, in terms of G rather than in terms of aS ***') + replace_dict['dcoupsetdpar'] = '\n'.join( dcoupsetdpar ) + dcoupsetdcoup = [ ' ' + line.replace('constexpr cxsmpl ','out.').replace('mdl_complexi', 'cI') for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') if line != '' ] + replace_dict['dcoupsetdcoup'] = ' ' + '\n'.join( dcoupsetdcoup ) + dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] + replace_dict['dcoupaccessbuffer'] = '\n'.join( dcoupaccessbuffer ) + '\n' + dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] + replace_dict['dcoupkernelaccess'] = '\n'.join( dcoupkernelaccess ) + '\n' + dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] + replace_dict['dcoupcompute'] = '\n'.join( dcoupcompute ) + # Special handling in EFT for fptype=float using SIMD + dcoupoutfptypev2 = [ ' fptype_v %sr_v;\n fptype_v %si_v;'%(name,name) for name in self.coups_dep ] + replace_dict['dcoupoutfptypev2'] = ( '\n' if len(self.coups_dep) > 0 else '' ) + '\n'.join( dcoupoutfptypev2 ) + replace_dict['dcoupsetdpar2'] = replace_dict['dcoupsetdpar'].replace('fptype_sv','fptype') + dcoupsetdcoup2 = [ ' ' + line.replace('constexpr cxsmpl ','const cxtype ').replace('mdl_complexi', 'cI') for line in 
self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') if line != '' ] + dcoupsetdcoup2 += [ ' %sr_v[i] = cxreal( %s );\n %si_v[i] = cximag( %s );'%(name,name,name,name) for name in self.coups_dep ] + replace_dict['dcoupsetdcoup2'] = ' ' + '\n'.join( dcoupsetdcoup2 ) + dcoupoutdcoup2 = [ ' out.%s = cxtype_v( %sr_v, %si_v );'%(name,name,name) for name in self.coups_dep ] + replace_dict['dcoupoutdcoup2'] = '\n' + '\n'.join( dcoupoutdcoup2 ) + for par in bsmparam_indep_complex_used: + replace_dict['dcoupsetdcoup'] = replace_dict['dcoupsetdcoup'].replace( par, '(cxtype)'+par ) + replace_dict['dcoupsetdcoup2'] = replace_dict['dcoupsetdcoup2'].replace( par, '(cxtype)'+par ) + else: + replace_dict['idcoup'] = ' // NB: there are no aS-dependent couplings in this physics process' + replace_dict['dcoupdecl'] = ' // (none)' + replace_dict['dcoupsetdpar'] = ' // (none)' + replace_dict['dcoupsetdcoup'] = ' // (none)' + replace_dict['dcoupaccessbuffer'] = '' + replace_dict['dcoupkernelaccess'] = '' + replace_dict['dcoupcompute'] = ' // NB: there are no aS-dependent couplings in this physics process' + # Special handling in EFT for fptype=float using SIMD + replace_dict['dcoupoutfptypev2'] = '' + replace_dict['dcoupsetdpar2'] = ' // (none)' + replace_dict['dcoupsetdcoup2'] = ' // (none)' + replace_dict['dcoupoutdcoup2'] = '' + # Require HRDCOD=1 in EFT and special handling in EFT for fptype=float using SIMD + nbsmparam_indep_all_used = len( bsmparam_indep_real_used ) + 2 * len( bsmparam_indep_complex_used ) + replace_dict['max_flavor'] = max(len(ids) for ids in self.model['merged_particles'].values()) + replace_dict['bsmdefine'] = '#define MGONGPUCPP_NBSMINDEPPARAM_GT_0 1' if nbsmparam_indep_all_used > 0 else '#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0' + replace_dict['nbsmip'] = nbsmparam_indep_all_used # NB this is now done also for 'sm' processes (no check on model name, see PR #824) + replace_dict['hasbsmip'] = '' if nbsmparam_indep_all_used > 0 else '//' + 
replace_dict['bsmip'] = ', '.join( list(bsmparam_indep_real_used) + [ '%s.real(), %s.imag()'%(par,par) for par in bsmparam_indep_complex_used] ) if nbsmparam_indep_all_used > 0 else '(none)' + replace_dict['eftwarn0'] = '' + replace_dict['eftwarn1'] = '' + ###if 'eft' in self.model_name.lower(): + ### replace_dict['eftwarn0'] = '\n//#warning Support for EFT physics models is still limited for HRDCOD=0 builds (#439 and PR #625)' + ### replace_dict['eftwarn1'] = '\n//#warning Support for EFT physics models is still limited for HRDCOD=1 builds (#439 and PR #625)' + if len( bsmparam_indep_real_used ) + len( bsmparam_indep_complex_used ) == 0: + replace_dict['eftspecial0'] = '\n // No special handling of non-hardcoded parameters (no additional BSM parameters needed in constant memory)' + else: + replace_dict['eftspecial0'] = '' + for ipar, par in enumerate( bsmparam_indep_real_used ) : replace_dict['eftspecial0'] += '\n const double %s = bsmIndepParamPtr[%i];' % ( par, ipar ) + for ipar, par in enumerate( bsmparam_indep_complex_used ) : replace_dict['eftspecial0'] += '\n const cxsmpl %s = cxsmpl( bsmIndepParamPtr[%i], bsmIndepParamPtr[%i] );' % ( par, 2*ipar, 2*ipar+1 ) + file_h = self.read_template_file(self.param_template_h) % replace_dict + file_cc = self.read_template_file(self.param_template_cc) % replace_dict + return file_h, file_cc + + def write_parameter_class_files(self): + # Rename Parameters_%(model_name).h/cc to Parameters.h/cc + super().write_parameter_class_files() + + # compute the paths the legacy method wrote + h_dir = os.path.join(self.dir_path, self.include_dir) + cc_dir = os.path.join(self.dir_path, self.cc_file_dir) + + src_h = os.path.join(h_dir, "Parameters_%s.h" % self.model_name) + src_cc = os.path.join(cc_dir, "Parameters_%s.cc" % self.model_name) + + dst_h = os.path.join(h_dir, "Parameters.h") + dst_cc = os.path.join(cc_dir, "Parameters.cc") + + if os.path.exists(src_h): + os.replace(src_h, dst_h) + if os.path.exists(src_cc): + 
os.replace(src_cc, dst_cc) + + # AV - overload export_cpp.UFOModelConverterCPP method (improve formatting) + def generate_parameters_class_files(self): + ###file_h, file_cc = super().generate_parameters_class_files() + file_h, file_cc = self.super_generate_parameters_class_files() + file_h = file_h[:-1] # remove extra trailing '\n' + file_cc = file_cc[:-1] # remove extra trailing '\n' + # [NB: there is a minor bug in export_cpp.UFOModelConverterCPP.generate_parameters_class_files + # ['independent_couplings' contains dependent parameters, 'dependent parameters' contains independent_couplings] + # [This only affects the order in which they are printed out - which is now reversed in the templates] + # [This has been reported as bug https://bugs.launchpad.net/mg5amcnlo/+bug/1959192] + return file_h, file_cc + + # AV - replace export_cpp.UFOModelConverterCPP method (add explicit std namespace) + def write_print_parameters(self, params): + """Write out the lines of independent parameters""" + # For each parameter, write name = expr; + res_strings = [] + for param in params: + res_strings.append('std::cout << std::setw( 20 ) << \"%s = \" << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << %s << std::endl;' % (param.name, param.name)) # AV + if len(res_strings) == 0 : res_strings.append('// (none)') + ##return '\n'.join(res_strings) + return '\n '.join(res_strings) # AV (why was this not necessary before?) 
+ + # AV - replace export_cpp.UFOModelConverterCPP method (add debug printouts) + # (This is where the loop over FFV functions takes place - I had a hard time to understand it) + # (Note also that write_combined_cc seems to never be called for our eemumu and ggttgg examples) + # The calling sequence is the following (understood via MG5_debug after forcing an error by renaming 'write') + # - madgraph_interface.py 8369 in finalize => self._curr_exporter.convert_model(self._curr_model + # - output.py 127 in convert_model => super().convert_model(model, wanted_lorentz, wanted_coupling) + # - export_cpp.py 2503 in convert_model => model_builder.write_files() + # - export_cpp.py 128 in write_files => self.write_aloha_routines() + # - export_cpp.py 392 in write_aloha_routines => h_rout, cc_rout = abstracthelas.write(output_dir=None, + # - create_aloha.py 97 in write => text = writer.write(mode=mode, **opt) + # [this is PLUGIN_ALOHAWriter.write which defaults to ALOHAWriterForCPP.write] + # [therein, cc_text comes from WriteALOHA.write, while h_text comes from get_h_text] + def write_aloha_routines(self): + """Generate the hel_amps_model.h and hel_amps_model.cc files, which + have the complete set of generalized Helas routines for the model""" + import aloha.create_aloha as create_aloha + if not os.path.isdir(os.path.join(self.dir_path, self.include_dir)): + os.makedirs(os.path.join(self.dir_path, self.include_dir)) + if not os.path.isdir(os.path.join(self.dir_path, self.cc_file_dir)): + os.makedirs(os.path.join(self.dir_path, self.cc_file_dir)) + model_h_file = os.path.join(self.dir_path, self.include_dir, + 'HelAmps_%s.h' % self.model_name) + model_cc_file = os.path.join(self.dir_path, self.cc_file_dir, + 'HelAmps_%s.%s' % (self.model_name, self.cc_ext)) + replace_dict = {} + replace_dict['output_name'] = self.output_name + replace_dict['info_lines'] = PLUGIN_export_cpp.get_mg5_info_lines() + replace_dict['namespace'] = self.namespace + replace_dict['model_name'] = 
self.model_name + # Read in the template .h and .cc files, stripped of compiler commands and namespaces + template_h_files = self.read_aloha_template_files(ext = 'h') + template_cc_files = self.read_aloha_template_files(ext = 'cc') + aloha_model = create_aloha.AbstractALOHAModel(self.model.get('name'), explicit_combine=True) + aloha_model.add_Lorentz_object(self.model.get('lorentz')) + if self.wanted_lorentz: + aloha_model.compute_subset(self.wanted_lorentz) + else: + aloha_model.compute_all(save=False, custom_propa=True) + for abstracthelas in dict(aloha_model).values(): + print(type(abstracthelas), abstracthelas.name) # AV this is the loop on FFV functions + h_rout, cc_rout = abstracthelas.write(output_dir=None, language=self.aloha_writer, mode='no_include') + template_h_files.append(h_rout) + template_cc_files.append(cc_rout) + replace_dict['function_declarations'] = '\n'.join(template_h_files) + replace_dict['function_definitions'] = '\n'.join(template_cc_files) + file_h = self.read_template_file(self.aloha_template_h) % replace_dict + file_cc = self.read_template_file(self.aloha_template_cc) % replace_dict + file_cc = '\n'.join( file_cc.split('\n')[9:] ) # skip first 9 lines in cpp_hel_amps_cc.inc (copyright including ALOHA) + # Write the HelAmps_sm.h and HelAmps_sm.cc files + ###PLUGIN_writers.CPPWriter(model_h_file).writelines(file_h) + ###PLUGIN_writers.CPPWriter(model_cc_file).writelines(file_cc) + ###logger.info('Created files %s and %s in directory' \ + ### % (os.path.split(model_h_file)[-1], + ### os.path.split(model_cc_file)[-1])) + ###logger.info('%s and %s' % \ + ### (os.path.split(model_h_file)[0], + ### os.path.split(model_cc_file)[0])) + # Write only the HelAmps_sm.h file + file_h_lines = file_h.split('\n') + file_h = '\n'.join( file_h_lines[:-3]) # skip the trailing '//---' + file_h += file_cc # append the contents of HelAmps_sm.cc directly to HelAmps_sm.h! 
+ file_h = file_h[:-1] # skip the trailing empty line + PLUGIN_writers.CPPWriter(model_h_file).writelines(file_h) + logger.info('Created file %s in directory %s' \ + % (os.path.split(model_h_file)[-1], os.path.split(model_h_file)[0] ) ) + + def prepare_couplings(self, wanted_couplings = []): + super().prepare_couplings(wanted_couplings) + # the two lines below fix #748, i.e. they re-order the dictionary keys following the order in wanted_couplings + + def all_str(wanted_couplings): + str_repr = [] + for coup in wanted_couplings: + if isinstance(coup, base_objects.FLV_Coupling): + str_repr.append(coup.name) + else: + str_repr.append(coup) + return str_repr + + running_wanted_couplings = [value for value in all_str(wanted_couplings) if value in self.coups_dep] + ordered_dict = [(k, self.coups_dep[k]) for k in running_wanted_couplings] + self.coups_dep = dict((x, y) for x, y in ordered_dict) + +#------------------------------------------------------------------------------------ + +import madgraph.iolibs.files as files +import madgraph.various.misc as misc +import madgraph.iolibs.export_v4 as export_v4 +import madgraph.core.base_objects as base_objects +# AV - define a custom OneProcessExporter +# (NB: enable this via PLUGIN_ProcessExporter.oneprocessclass in output.py) +# (NB: use this directly also in PLUGIN_UFOModelConverter.read_template_file) +# (NB: use this directly also in PLUGIN_GPUFOHelasCallWriter.super_get_matrix_element_calls) +class PLUGIN_OneProcessExporter(PLUGIN_export_cpp.OneProcessExporterGPU): + # Class structure information + # - object + # - OneProcessExporterCPP(object) [in madgraph/iolibs/export_cpp.py] + # - OneProcessExporterGPU(OneProcessExporterCPP) [in madgraph/iolibs/export_cpp.py] + # - PLUGIN_OneProcessExporter(OneProcessExporterGPU) + # This class + + # AV - change defaults from export_cpp.OneProcessExporterGPU + # [NB process_class = "CPPProcess" is set in OneProcessExporterCPP.__init__] + # [NB process_class = "gCPPProcess" is set in 
    # OneProcessExporterGPU.__init__]
    cc_ext = 'cc' # create CPPProcess.cc (build it also as CPPProcess_cu.so, no longer symlink it as gCPPProcess.cu)

    # AV - keep defaults from export_cpp.OneProcessExporterGPU
    ###process_dir = '.'
    ###include_dir = '.'
    ###process_template_h = 'gpu/process_h.inc'
    ###process_template_cc = 'gpu/process_cc.inc'
    ###process_class_template = 'gpu/process_class.inc'
    ###process_definition_template = 'gpu/process_function_definitions.inc'
    ###process_wavefunction_template = 'cpp_process_wavefunctions.inc'
    ###process_sigmaKin_function_template = 'gpu/process_sigmaKin_function.inc'
    ###single_process_template = 'gpu/process_matrix.inc'
    ###support_multichannel = False
    ###multichannel_var = ',fptype& multi_chanel_num, fptype& multi_chanel_denom'

    # AV - use template files from PLUGINDIR instead of MG5DIR
    ###template_path = os.path.join(_file_path, 'iolibs', 'template_files')
    ###__template_path = os.path.join(_file_path, 'iolibs', 'template_files')
    template_path = os.path.join( PLUGINDIR, 'madgraph', 'iolibs', 'template_files' )
    __template_path = os.path.join( PLUGINDIR, 'madgraph', 'iolibs', 'template_files' )

    # AV - overload export_cpp.OneProcessExporterGPU constructor (rename gCPPProcess to CPPProcess, set include_multi_channel)
    def __init__(self, *args, **kwargs):
        """Forward all arguments to OneProcessExporterGPU, rename the generated
        process class to 'CPPProcess', and derive self.proc_id: 'prefix'+1 in
        madevent+cudacpp mode (prefix kwarg present), 0 in standalone mode."""
        ###misc.sprint('Entering PLUGIN_OneProcessExporter.__init__')
        for kwarg in kwargs: misc.sprint( 'kwargs[%s] = %s' %( kwarg, kwargs[kwarg] ) )
        super().__init__(*args, **kwargs)
        self.process_class = 'CPPProcess'
        ###if self.in_madevent_mode: proc_id = kwargs['prefix']+1 # madevent+cudacpp (NB: HERE SELF.IN_MADEVENT_MODE DOES NOT WORK!)
        if 'prefix' in kwargs: proc_id = kwargs['prefix']+1 # madevent+cudacpp (ime+1 from ProcessExporterFortranMEGroup.generate_subprocess_directory)
        else: proc_id = 0 # standalone_cudacpp
        ###misc.sprint(proc_id)
        self.proc_id = proc_id

    # AV - overload export_cpp.OneProcessExporterGPU method (indent comments in process_lines)
    def get_process_class_definitions(self, write=True):
        """Fill the replacement dictionary for the process class template.
        Return the filled template text (copyright header stripped) when write
        is True, otherwise the dictionary itself (for child exporters)."""
        replace_dict = super().get_process_class_definitions(write=False)
        replace_dict['process_lines'] = replace_dict['process_lines'].replace('\n','\n ')
        ###misc.sprint( replace_dict['nwavefuncs'] ) # NB: this (from export_cpp) is the WRONG value of nwf, e.g. 6 for gg_tt (#644)
        ###misc.sprint( self.matrix_elements[0].get_number_of_wavefunctions() ) # NB: this is a different WRONG value of nwf, e.g. 7 for gg_tt (#644)
        ###replace_dict['nwavefunc'] = self.matrix_elements[0].get_number_of_wavefunctions() # how do I get HERE the right value of nwf, e.g. 5 for gg_tt?
        nexternal, nincoming = self.matrix_elements[0].get_nexternal_ninitial()
        replace_dict['nincoming'] = nincoming
        replace_dict['noutcoming'] = nexternal - nincoming
        replace_dict['nbhel'] = self.matrix_elements[0].get_helicity_combinations() # number of helicity combinations
        replace_dict['ndiagrams'] = len(self.matrix_elements[0].get('diagrams')) # AV FIXME #910: elsewhere matrix_element.get('diagrams') and max(config[0]...
        replace_dict['nmaxflavor'] = len(self.matrix_elements[0].get_external_flavors_with_iden()) # number of flavor combinations
        if( write ): # ZW: added dict return for uses in child exporters.
            # (ZW, cont.) Default argument is True so no need to modify other calls to this function
            file = self.read_template_file(self.process_class_template) % replace_dict
            file = '\n'.join( file.split('\n')[8:] ) # skip first 8 lines in process_class.inc (copyright)
            return file
        else:
            return replace_dict

    # AV - replace export_cpp.OneProcessExporterGPU method (fix CPPProcess.cc)
    def get_process_function_definitions(self, write=True):
        """The complete class definition for the process"""
        replace_dict = super(PLUGIN_export_cpp.OneProcessExporterGPU,self).get_process_function_definitions(write=False) # defines replace_dict['initProc_lines']
        replace_dict['hardcoded_initProc_lines'] = replace_dict['initProc_lines'].replace( 'm_pars->', 'Parameters::')
        couplings2order_indep = [] # NOTE(review): appears unused in this method — TODO confirm and remove
        ###replace_dict['ncouplings'] = len(self.couplings2order)
        ###replace_dict['ncouplingstimes2'] = 2 * replace_dict['ncouplings']
        replace_dict['nparams'] = len(self.params2order)
        ###replace_dict['nmodels'] = replace_dict['nparams'] + replace_dict['ncouplings'] # AV unused???
        replace_dict['coupling_list'] = ' '
        replace_dict['hel_amps_cc'] = '#include \"HelAmps_%s.cc\"' % self.model_name # AV
        # Invert the name->position maps into position-ordered lists
        coupling = [''] * len(self.couplings2order)
        params = [''] * len(self.params2order)
        flv_couplings = [''] * len(self.couporderflv) # NOTE(review): never filled from couporderflv here — presumably populated elsewhere in the original; verify against the full commit
        for coup, pos in self.couplings2order.items():
            coupling[pos] = coup
        for para, pos in self.params2order.items():
            params[pos] = para
        coupling_indep = [] # AV keep only the alphas-independent couplings #434
        for coup in coupling:
            keep = True
            # Use the same implementation as in UFOModelConverterCPP.prepare_couplings (assume self.model is the same)
            for key, coup_list in self.model['couplings'].items():
                if "aS" in key and coup in coup_list: keep = False
            if keep: coupling_indep.append( coup ) # AV only indep!
        replace_dict['ncouplings'] = len(coupling_indep) # AV only indep!
        replace_dict['nipc'] = len(coupling_indep)
        if len(coupling_indep) > 0:
            replace_dict['cipcassign'] = 'const cxtype tIPC[nIPC] = { cxmake( m_pars->%s ) };'\
                % ( ' ), cxmake( m_pars->'.join(coupling_indep) ) # AV only indep!
            replace_dict['cipcdevice'] = '__device__ __constant__ fptype cIPC[nIPC * 2];'
            replace_dict['cipcstatic'] = 'static fptype cIPC[nIPC * 2];'
            replace_dict['cipc2tipcSym'] = 'gpuMemcpyToSymbol( cIPC, tIPC, nIPC * sizeof( cxtype ) );'
            replace_dict['cipc2tipc'] = 'memcpy( cIPC, tIPC, nIPC * sizeof( cxtype ) );'
            # NOTE(review): the following span is CORRUPTED in this patch chunk — the rest of the
            # cipcdump string and the start of the 'cipfassign' assignment were eaten in transit
            # (likely angle-bracketed text stripped). Reproduced verbatim below; recover from the
            # original commit before applying this patch.
            replace_dict['cipcdump'] = '\n //for ( int i=0; i%s };
            for (int i = 0; i < nIPF; ++i) {
              memcpy( tIPF_partner1 + i * nMF, tFLV[i].partner1, nMF * sizeof( int ) );
              memcpy( tIPF_partner2 + i * nMF, tFLV[i].partner2, nMF * sizeof( int ) );
              for (int j = 0; j < nMF; ++j)
                tIPF_value[i * nMF + j] = *tFLV[i].value[j] ? *tFLV[i].value[j] : cxtype{}; // guard from null pointers
            }""" % ( ', m_pars->'.join(flv_couplings) )
            replace_dict['cipfdevice'] = """__device__ __constant__ int cIPF_partner1[nMF * nIPF];
    __device__ __constant__ int cIPF_partner2[nMF * nIPF];
    __device__ __constant__ fptype cIPF_value[nMF * nIPF * 2];"""
            replace_dict['cipfstatic'] = """static int cIPF_partner1[nMF * nIPF];
    static int cIPF_partner2[nMF * nIPF];
    static fptype cIPF_value[nMF * nIPF * 2];"""
            replace_dict['cipf2tipfSym'] = """gpuMemcpyToSymbol( cIPF_partner1, tIPF_partner1, nMF * nIPF * sizeof( int ) );
    gpuMemcpyToSymbol( cIPF_partner2, tIPF_partner2, nMF * nIPF * sizeof( int ) );
    gpuMemcpyToSymbol( cIPF_value , tIPF_value , nMF * nIPF * sizeof( cxtype ) );"""
            replace_dict['cipf2tipf'] = """memcpy( cIPF_partner1, tIPF_partner1, nMF * nIPF * sizeof( int ) );
    memcpy( cIPF_partner2, tIPF_partner2, nMF * nIPF * sizeof( int ) );
    memcpy( cIPF_value , tIPF_value , nMF * nIPF * sizeof( cxtype ) );"""
            replace_dict['cipfdump'] = '''
    //for ( int i=0; i < nIPD; i++ ) {
    // std::cout << std::setprecision(17) << "tIPF[i].partner1 = { ";
    // for ( int j=0; j < nMF-1; j++ ) std::cout << std::setprecision(17) << tIPF[i].partner1[j] << ", ";
    // std::cout << std::setprecision(17) << tIPF[i].partner1[nMF-1] << " }" << std::endl;
    // std::cout << std::setprecision(17) << "tIPF[i].partner2 = { ";
    // for ( int j=0; j < nMF-1; j++ ) std::cout << std::setprecision(17) << tIPF[i].partner2[j] << ", ";
    // std::cout << std::setprecision(17) << tIPF[i].partner2[nMF-1] << " }" << std::endl;
    // std::cout << std::setprecision(17) << "tIPF[i].value = { ";
    // for ( int j=0; j < nMF-1; j++ ) std::cout << std::setprecision(17) << tIPF[i].value[j] << ", ";
    // std::cout << std::setprecision(17) << tIPF[i].value[nMF-1] << " }" << std::endl;
    //}
'''
            # Build the hardcoded-parameter variants of the flavor coupling arrays
            coup_str_hrd_partner1 = '__device__ const int cIPF_partner1[nMF * nIPF] = { '
            coup_str_hrd_partner2 = '__device__ const int cIPF_partner2[nMF * nIPF] = { '
            coup_str_hrd_value = '__device__ const fptype cIPF_value[nMF * nIPF * 2] = { '
            for flv_coup in flv_couplings:
                coup_str_hrd_partner1 += ( ('Parameters_%(model_name)s::%(coup)s.param1' % {"model_name": self.model_name, "coup": flv_coup} + '[%d], ') * nMF) % ( *range(nMF), )
                coup_str_hrd_partner2 += ( ('Parameters_%(model_name)s::%(coup)s.param2' % {"model_name": self.model_name, "coup": flv_coup} + '[%d], ') * nMF) % ( *range(nMF), )
                value_string = '(fptype)Parameters_%(model_name)s::%(coup)s.value' % {"model_name": self.model_name, "coup": flv_coup}
                range_ids = [ [ i, i ] for i in range(nMF) ]
                coup_str_hrd_value += ( ( value_string + '[%d].real(), ' + value_string + '[%d].imag(), ' ) * nMF) % ( *[ j for i in range_ids for j in i ], )
            # Drop the trailing ", " and close the C initializer lists
            coup_str_hrd_partner1 = coup_str_hrd_partner1[:-2] + ' };'
            coup_str_hrd_partner2 = coup_str_hrd_partner2[:-2] + ' };'
            coup_str_hrd_value = coup_str_hrd_value[:-2] + ' };'
            replace_dict['cipfhrdcod'] = '%s\n %s\n %s' % (coup_str_hrd_partner1, coup_str_hrd_partner2, coup_str_hrd_value)
        else:
            replace_dict['cipfassign'] = ''
            replace_dict['cipfdevice'] = """__device__ __constant__ int* cIPF_partner1 = nullptr; // unused as nIPF=0'
    __device__ __constant__ int* cIPF_partner2 = nullptr; // unused as nIPF=0'
    __device__ __constant__ fptype* cIPF_value = nullptr; // unused as nIPF=0'"""
            replace_dict['cipfstatic'] = """static int* cIPF_partner1 = nullptr; // unused as nIPF=0'
    static int* cIPF_partner2 = nullptr; // unused as nIPF=0'
    static fptype* cIPF_value = nullptr; // unused as nIPF=0'"""
            replace_dict['cipf2tipfSym'] = ''
            replace_dict['cipf2tipf'] = ''
            replace_dict['cipfdump'] = ''
            replace_dict['cipfhrdcod'] = """__device__ const int* cIPF_partner1 = nullptr; // unused as nIPF=0'
    __device__ const int* cIPF_partner2 = nullptr; // unused as nIPF=0'
    __device__ const fptype* cIPF_value = nullptr; // unused as nIPF=0'"""
        # FIXME! Here there should be different code generated depending on MGONGPUCPP_NBSMINDEPPARAM_GT_0 (issue #827)
        replace_dict['all_helicities'] = self.get_helicity_matrix(self.matrix_elements[0])
        replace_dict['all_helicities'] = replace_dict['all_helicities'] .replace('helicities', 'tHel')
        replace_dict['all_flavors'] = self.get_flavor_matrix(self.matrix_elements[0])
        replace_dict['all_flavors'] = replace_dict['all_flavors'].replace('flavors', 'tFlavors')
        color_amplitudes = [me.get_color_amplitudes() for me in self.matrix_elements] # as in OneProcessExporterCPP.get_process_function_definitions
        replace_dict['ncolor'] = len(color_amplitudes[0])
        # broken_symmetry_factor function
        data = self.matrix_elements[0].get('processes')[0].get_final_ids_after_decay()
        pids = str(data).replace('[', '{').replace(']', '}')
        replace_dict['get_pid'] = 'int pid[] = %s;' % (pids)
        # Old symmetry factor = product of factorials of identical-particle multiplicities
        replace_dict['get_old_symmmetry_value'] = 1
        done = []
        for value in data:
            if value not in done:
                done.append(value)
                replace_dict['get_old_symmmetry_value'] *= math.factorial(data.count(value))
        _, nincoming = self.matrix_elements[0].get_nexternal_ninitial()
        replace_dict['nincoming'] = nincoming

        file = self.read_template_file(self.process_definition_template) % replace_dict # HACK! ignore write=False case
        if len(params) == 0: # remove cIPD from OpenMP pragma (issue #349)
            file_lines = file.split('\n')
            file_lines = [l.replace('cIPC, cIPD','cIPC') for l in file_lines] # remove cIPD from OpenMP pragma
            file = '\n'.join( file_lines )
        file = '\n'.join( file.split('\n')[8:] ) # skip first 8 lines in process_function_definitions.inc (copyright)
        return file

    # AV - modify export_cpp.OneProcessExporterGPU method (add debug printouts for multichannel #342)
    def get_sigmaKin_lines(self, color_amplitudes, write=True):
        """Fill the sigmaKin template dictionary (proc_id, den_factors); return
        (text, dict) when write is True, the dict alone otherwise."""
        ###misc.sprint('Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines')
        ###misc.sprint(self.include_multi_channel)
        ###misc.sprint(self.support_multichannel)
        replace_dict = super().get_sigmaKin_lines(color_amplitudes, write=False)
        replace_dict['proc_id'] = self.proc_id if self.proc_id>0 else 1
        replace_dict['proc_id_source'] = 'madevent + cudacpp exporter' if self.proc_id>0 else 'standalone_cudacpp' # FIXME? use self.in_madevent_mode instead?

        # Extract denominator (avoid to extend size for mirroring)
        den_factors = [str(me.get_denominator_factor()) for me in \
                       self.matrix_elements]
        replace_dict['den_factors'] = ",".join(den_factors)

        if write:
            file = self.read_template_file(self.process_sigmaKin_function_template) % replace_dict
            file = '\n'.join( file.split('\n')[8:] ) # skip first 8 lines in process_sigmaKin_function.inc (copyright)
            return file, replace_dict
        else:
            return replace_dict

    # AV - modify export_cpp.OneProcessExporterGPU method (fix CPPProcess.cc)
    def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
        """Get sigmaKin_process for all subprocesses for CPPProcess.cc"""
        ret_lines = []
        if self.single_helicities:
            ###assert self.include_multi_channel # remove this assert: must handle both cases and produce two different code bases (#473)
            ###misc.sprint(type(self.helas_call_writer))
            ###misc.sprint(self.support_multichannel, self.include_multi_channel)
            multi_channel = None
            if self.include_multi_channel:
                if not self.support_multichannel:
                    raise Exception("link with madevent not supported")
                multi_channel = self.get_multi_channel_dictionary(self.matrix_elements[0].get('diagrams'), self.include_multi_channel)
            ###misc.sprint(multi_channel)
            ###misc.sprint( 'before get_matrix_element_calls', self.matrix_elements[0].get_number_of_wavefunctions() ) # WRONG value of nwf, eg 7 for gg_tt
            helas_calls = self.helas_call_writer.get_matrix_element_calls(\
                self.matrix_elements[0],
                color_amplitudes[0],
                multi_channel_map = multi_channel
                )
            ###misc.sprint( 'after get_matrix_element_calls', self.matrix_elements[0].get_number_of_wavefunctions() ) # CORRECT value of nwf, eg 5 for gg_tt
            assert len(self.matrix_elements) == 1 # how to handle if this is not true?
            self.couplings2order = self.helas_call_writer.couplings2order
            self.couporderflv = self.helas_call_writer.couporderflv
            self.params2order = self.helas_call_writer.params2order
            # Emit the (C++/CUDA) preamble of calculate_jamps; the helas calls generated above are appended after it.
            # NOTE(review): whitespace/alignment inside these generated-code string literals could not be
            # verified from this patch chunk (line fusing) — compare against the original commit.
            ret_lines.append("""
  // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams
  // Also compute running sums over helicities adding jamp2, numerator, denominator
  // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel)
  // In CUDA, this function processes a single event
  // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function)
  // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input)
  // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
  // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898
  __global__ void /* clang-format off */
  calculate_jamps( int ihel,
                   const fptype* allmomenta, // input: momenta[nevt*npar*4]
                   const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                   const unsigned int* iflavorVec, // input: indices of the flavor combinations
#ifdef MGONGPUCPP_GPUIMPL
                   fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                   bool storeChannelWeights,
                   fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                   fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                   fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
#endif
                   const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#else
                   cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                   bool storeChannelWeights,
                   fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                   fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                   fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
#endif
                   const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
#endif
                   )
  //ALWAYS_INLINE // attributes are not permitted in a function definition
  {
#ifdef MGONGPUCPP_GPUIMPL
    using namespace mg5amcGpu;
    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
    using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
    using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
    using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
    using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
    using F_ACCESS = DeviceAccessIflavorVec; // non-trivial access: buffer includes all events
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events
    using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
#endif
#else
    using namespace mg5amcCpu;
    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
    using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
    using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
    using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
    using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
    using F_ACCESS = HostAccessIflavorVec; // non-trivial access: buffer includes all events
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
    using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
#endif
#endif
    mgDebug( 0, __FUNCTION__ );
    //bool debug = true;
#ifndef MGONGPUCPP_GPUIMPL
    //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831
    //if( debug ) printf( \"calculate_jamps: ievt00=%d ihel=%2d\\n\", ievt00, ihel );
#else
    //const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    //debug = ( ievt == 0 );
    //if( debug ) printf( \"calculate_jamps: ievt=%6d ihel=%2d\\n\", ievt, ihel );
#endif /* clang-format on */""")
            nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions()
            ret_lines.append("""
    // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here
    // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result...
    static const int nwf = %i; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)"""%nwavefuncs )
            ret_lines.append("""
    // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
    // [NB these variables are reused several times (and re-initialised each time) within the same event or event page]
    // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need
    // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)!
    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
    // Create memory for both momenta and wavefunctions separately, and later wrap them in ALOHAOBJ
    fptype_sv pvec_sv[nwf][np4];
    cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is 4: spin wavefunctions, momenta are no more included, see before)
    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram

    // Wrap the memory into ALOHAOBJ
    ALOHAOBJ aloha_obj[nwf];
    for( int iwf = 0; iwf < nwf; iwf++ ) aloha_obj[iwf] = ALOHAOBJ{pvec_sv[iwf], w_sv[iwf]};
    fptype* amp_fp;
    amp_fp = reinterpret_cast( amp_sv );

    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)

    // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
    // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===

    // START LOOP ON IPARITY
    for( int iParity = 0; iParity < nParity; ++iParity )
    {
#ifndef MGONGPUCPP_GPUIMPL
      const int ievt0 = ievt00 + iParity * neppV;
#endif""")
            # NOTE(review): 'reinterpret_cast( amp_sv )' above lost its template argument
            # (presumably a pointer-type cast) in this patch chunk — recover from the original commit.
            ret_lines += helas_calls
        else:
            ret_lines.extend([self.get_sigmaKin_single_process(i, me) \
                              for i, me in enumerate(self.matrix_elements)])
        #ret_lines.extend([self.get_matrix_single_process(i, me,
        #    color_amplitudes[i],
        #    class_name) \
        #    for i, me in enumerate(self.matrix_elements)])
        file_extend = []
        for i, me in enumerate(self.matrix_elements):
            file = self.get_matrix_single_process( i, me, color_amplitudes[i], class_name )
            file = '\n'.join( file.split('\n')[8:] ) # skip first 8 lines in process_matrix.inc (copyright)
            file_extend.append( file )
            assert i == 0, "more than one ME in get_all_sigmaKin_lines" # AV sanity check (added for color_sum.cc but valid independently)
        ret_lines.extend( file_extend )
        return '\n'.join(ret_lines)

    # AV - modify export_cpp.OneProcessExporterGPU method (replace '# Process' by '// Process')
    def get_process_info_lines(self, matrix_element):
        """Return info lines describing the processes for this matrix element"""
        ###return'\n'.join([ '# ' + process.nice_string().replace('\n', '\n# * ') \
        ###                 for process in matrix_element.get('processes')])
        return'\n'.join([ '// ' + process.nice_string().replace('\n', '\n// * ') \
                          for process in matrix_element.get('processes')])

    # AV - replace the export_cpp.OneProcessExporterGPU method (invert .cc/.cu, add debug printouts)
    def generate_process_files(self):
        """Generate mgOnGpuConfig.h, CPPProcess.cc, CPPProcess.h, check_sa.cc, gXXX.cu links"""
        ###misc.sprint('Entering PLUGIN_OneProcessExporter.generate_process_files')
        ###if self.include_multi_channel:
        ###    misc.sprint('self.include_multi_channel is already defined: this is madevent+second_exporter mode') # FIXME? use self.in_madevent_mode instead?
        if not self.include_multi_channel:
            ###misc.sprint('self.include_multi_channel is not yet defined: this is standalone_cudacpp mode') # see issue #473
            # AV: needed for (moved to?) standalone_cudacpp mode (but do we need those lines at all???)
+ # OM: this condition is likely wrong and need to be removed + if self.matrix_elements[0].get('has_mirror_process'): + self.matrix_elements[0].set('has_mirror_process', False) + self.nprocesses/=2 + super(PLUGIN_export_cpp.OneProcessExporterGPU, self).generate_process_files() + self.edit_CMakeLists() + self.edit_check_sa() + self.edit_mgonGPU() + self.edit_processidfile() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) + self.edit_processConfig() # sub process specific, not to be symlinked from the Subprocesses directory + self.edit_colorsum() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) + self.edit_testxxx() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) + self.edit_memorybuffers() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) + self.edit_memoryaccesscouplings() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) + # NB: symlink of cudacpp.mk to makefile is overwritten by madevent makefile if this exists (#480) + # NB: this relies on the assumption that cudacpp code is generated before madevent code + files.ln(pjoin(self.path, 'cudacpp.mk'), self.path, 'makefile') + # Add link to makefile_original.mk, PR #1052 + files.ln(pjoin(self.path, '..', 'makefile_original.mk'), self.path, 'makefile_original.mk') + # Add symbolic links in the test directory + files.ln(pjoin(self.path + '/../../test', 'cudacpp_test.mk'), self.path + '/../../test', 'makefile') + # Add reference file in the test directory (if it exists for this process) + import pathlib + pathlib.Path(self.path + '/../../test/ref/.keepme').touch() + ###template_ref = 'dump_CPUTest.'+self.process_name+'.txt' + template_ref = self.template_path + '/../../../test/ref/' + 'dump_CPUTest.' 
+ self.process_name + '.txt' + for ref in template_ref, template_ref + '2' : # two different reference files for tests without/with multichannel #896 + if os.path.exists( ref ): + ###misc.sprint( 'Copying test reference file: ', ref ) + PLUGIN_export_cpp.cp( ref, self.path + '/../../test/ref' ) + ###else: + ###misc.sprint( 'Test reference file does not exist and will not be copied: ', ref ) + + # SR - generate CMakeLists.txt file inside the P* directory + def edit_CMakeLists(self): + """Generate CMakeLists.txt""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_CMakeLists') + template = open(pjoin(self.template_path,'CMake/SubProcesses/CMakeLists_P.txt'),'r').read() + ff = open(pjoin(self.path, 'CMakeLists.txt'),'w') + ff.write(template) + ff.close() + + # AV - replace the export_cpp.OneProcessExporterGPU method (invert .cc/.cu, add debug printouts) + def edit_check_sa(self): + """Generate check_sa.cc and fcheck_sa.f""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_check_sa') + ff = open(pjoin(self.path, 'check_sa.cc'),'w') + template = open(pjoin(self.template_path,'gpu','check_sa.cc'),'r').read() + ff.write(template) # nothing to replace in check_sa.cc + ff.close() + replace_dict = {} + replace_dict['nexternal'], _ = self.matrix_elements[0].get_nexternal_ninitial() + ###replace_dict['model'] = self.model_name + ###replace_dict['numproc'] = len(self.matrix_elements) + ff = open(pjoin(self.path, 'fcheck_sa.f'),'w') + template = open(pjoin(self.template_path,'gpu','fcheck_sa.f'),'r').read() + ff.write(template % replace_dict) + ff.close() + + # AV - replace the export_cpp.OneProcessExporterGPU method (add debug printouts and multichannel handling #473) + def edit_mgonGPU(self): + """Generate mgOnGpuConfig.h""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_mgonGPU') + template = open(pjoin(self.template_path,'gpu','mgOnGpuConfig.h'),'r').read() + replace_dict = {} + nexternal, nincoming = 
self.matrix_elements[0].get_nexternal_ninitial() + replace_dict['nincoming'] = nincoming + replace_dict['noutcoming'] = nexternal - nincoming + replace_dict['nbhel'] = self.matrix_elements[0].get_helicity_combinations() # number of helicity combinations + ###replace_dict['nwavefunc'] = self.matrix_elements[0].get_number_of_wavefunctions() # this is the correct P1-specific nwf, now in CPPProcess.h (#644) + replace_dict['wavefuncsize'] = 6 + if self.include_multi_channel: + replace_dict['mgongpu_supports_multichannel'] = '#define MGONGPU_SUPPORTS_MULTICHANNEL 1' + else: + replace_dict['mgongpu_supports_multichannel'] = '#undef MGONGPU_SUPPORTS_MULTICHANNEL' + ff = open(pjoin(self.path, '..','..','src','mgOnGpuConfig.h'),'w') + ff.write(template % replace_dict) + ff.close() + + # AV - new method + def edit_processidfile(self): + """Generate epoch_process_id.h""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_processidfile') + template = open(pjoin(self.template_path,'gpu','epoch_process_id.h'),'r').read() + replace_dict = {} + replace_dict['processid'] = self.get_process_name() + replace_dict['processid_uppercase'] = self.get_process_name().upper() + ff = open(pjoin(self.path, 'epoch_process_id.h'),'w') + ff.write(template % replace_dict) + ff.close() + + # AV - new method + def edit_colorsum(self): + """Generate color_sum.cc""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_colorsum') + template = open(pjoin(self.template_path,'gpu','color_sum.cc'),'r').read() + replace_dict = {} + # Extract color matrix again (this was also in get_matrix_single_process called within get_all_sigmaKin_lines) + replace_dict['color_matrix_lines'] = self.get_color_matrix_lines(self.matrix_elements[0]) + ff = open(pjoin(self.path, 'color_sum.cc'),'w') + ff.write(template % replace_dict) + ff.close() + + def edit_processConfig(self): + """Generate process_config.h""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_processConfig') + template = 
open(pjoin(self.template_path,'gpu','processConfig.h'),'r').read() + replace_dict = {} + replace_dict['ndiagrams'] = len(self.matrix_elements[0].get('diagrams')) + replace_dict['processid_uppercase'] = self.get_process_name().upper() + ff = open(pjoin(self.path, 'processConfig.h'),'w') + ff.write(template % replace_dict) + ff.close() + + def generate_subprocess_directory_end(self, **opt): + """ opt contain all local variable of the fortran original function""" + if self.include_multi_channel: + #self.edit_coloramps() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) + subproc_diagrams_for_config = opt['subproc_diagrams_for_config'] + misc.sprint(len(subproc_diagrams_for_config)) + self.edit_coloramps( subproc_diagrams_for_config) + + # AV - new method + def edit_coloramps(self, config_subproc_map): + """Generate coloramps.h""" + + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_coloramps') + template = open(pjoin(self.template_path,'gpu','coloramps.h'),'r').read() + ff = open(pjoin(self.path, 'coloramps.h'),'w') + # The following five lines from OneProcessExporterCPP.get_sigmaKin_lines (using OneProcessExporterCPP.get_icolamp_lines) + replace_dict={} + + + iconfig_to_diag = {} + diag_to_iconfig = {} + iconfig = 0 + for config in config_subproc_map: + if set(config) == set([0]): + continue + iconfig += 1 + iconfig_to_diag[iconfig] = config[0] + diag_to_iconfig[config[0]] = iconfig + + misc.sprint(iconfig_to_diag) + misc.sprint(diag_to_iconfig) + + # Note that if the last diagram is/are not mapped to a channel nb_diag + # will be smaller than the true number of diagram. This is fine for color + # but maybe not for something else. 
+ nb_diag = max(config[0] for config in config_subproc_map) + ndigits = str(int(math.log10(nb_diag))+1+1) # the additional +1 is for the -sign + # Output which diagrams correspond ot a channel to get information for valid color + lines = [] + for diag in range(1, nb_diag+1): + channelidf = diag + channelidc = channelidf - 1 # C convention + if diag in diag_to_iconfig: + iconfigf = diag_to_iconfig[diag] + iconfigftxt = '%i'%iconfigf + else: + iconfigf = -1 + iconfigftxt = '-1 (diagram with no associated iconfig for single-diagram enhancement)' + text = ' %(iconfigf){0}i, // CHANNEL_ID=%(channelidf)-{0}i i.e. DIAGRAM=%(diag)-{0}i --> ICONFIG=%(iconfigftxt)s'.format(ndigits) + lines.append(text % {'diag':diag, 'channelidf':channelidf, 'iconfigf':iconfigf, 'iconfigftxt':iconfigftxt}) + replace_dict['channelc2iconfig_lines'] = '\n'.join(lines) + + if self.include_multi_channel: # NB unnecessary as edit_coloramps is not called otherwise... + subproc_to_confdiag = export_v4.ProcessExporterFortranMEGroup.get_confdiag_from_group_mapconfig(config_subproc_map, 0) + replace_dict['is_LC'] = self.get_icolamp_lines(subproc_to_confdiag, self.matrix_elements[0], 1) + replace_dict['nb_channel'] = len(subproc_to_confdiag) + replace_dict['nb_diag'] = max(config[0] for config in config_subproc_map) + replace_dict['nb_color'] = max(1,len(self.matrix_elements[0].get('color_basis'))) + + + # AV extra formatting (e.g. 
gg_tt was "{{true,true};,{true,false};,{false,true};};") + ###misc.sprint(replace_dict['is_LC']) + split = replace_dict['is_LC'].replace('{{','{').replace('};};','}').split(';,') + text=', // ICONFIG=%-{0}i <-- CHANNEL_ID=%i'.format(ndigits) + for iconfigc in range(len(split)): + ###misc.sprint(split[iconfigc]) + split[iconfigc] = ' ' + split[iconfigc].replace(',',', ').replace('true',' true').replace('{','{ ').replace('}',' }') + split[iconfigc] += text % (iconfigc+1, iconfig_to_diag[iconfigc+1]) + replace_dict['is_LC'] = '\n'.join(split) + ff.write(template % replace_dict) + ff.close() + + # AV - new method + def edit_testxxx(self): + """Generate testxxx.cc""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_testxxx') + template = open(pjoin(self.template_path,'gpu','testxxx.cc'),'r').read() + replace_dict = {} + replace_dict['model_name'] = self.model_name + ff = open(pjoin(self.path, '..', 'testxxx.cc'),'w') + ff.write(template % replace_dict) + ff.close() + + # AV - new method + def edit_memorybuffers(self): + """Generate MemoryBuffers.h""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_memorybuffers') + template = open(pjoin(self.template_path,'gpu','MemoryBuffers.h'),'r').read() + replace_dict = {} + replace_dict['model_name'] = self.model_name + ff = open(pjoin(self.path, '..', 'MemoryBuffers.h'),'w') + ff.write(template % replace_dict) + ff.close() + + # AV - new method + def edit_memoryaccesscouplings(self): + """Generate MemoryAccessCouplings.h""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings') + template = open(pjoin(self.template_path,'gpu','MemoryAccessCouplings.h'),'r').read() + replace_dict = {} + replace_dict['model_name'] = self.model_name + ff = open(pjoin(self.path, '..', 'MemoryAccessCouplings.h'),'w') + ff.write(template % replace_dict) + ff.close() + + # AV - overload the export_cpp.OneProcessExporterGPU method (add debug printout and truncate last \n) + # [*NB 
export_cpp.UFOModelConverterGPU.write_process_h_file is not called!*] + def write_process_h_file(self, writer): + """Generate final CPPProcess.h""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.write_process_h_file') + out = super().write_process_h_file(writer) + writer.seek(-1, os.SEEK_CUR) + writer.truncate() + return out + + # AV - replace the export_cpp.OneProcessExporterGPU method (replace HelAmps.cu by HelAmps.cc) + def super_write_process_cc_file(self, writer): + """Write the class member definition (.cc) file for the process described by matrix_element""" + replace_dict = super(PLUGIN_export_cpp.OneProcessExporterGPU, self).write_process_cc_file(False) + ###replace_dict['hel_amps_def'] = '\n#include \"../../src/HelAmps_%s.cu\"' % self.model_name + replace_dict['hel_amps_h'] = '#include \"HelAmps_%s.h\"' % self.model_name # AV + if writer: + file = self.read_template_file(self.process_template_cc) % replace_dict + # Write the file + writer.writelines(file) + else: + return replace_dict + + # AV - overload the export_cpp.OneProcessExporterGPU method (add debug printout and truncate last \n) + def write_process_cc_file(self, writer): + """Generate CPPProcess.cc""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.write_process_cc_file') + ###out = super().write_process_cc_file(writer) + out = self.super_write_process_cc_file(writer) + writer.seek(-1, os.SEEK_CUR) + writer.truncate() + return out + + # AV - replace the export_cpp.OneProcessExporterGPU method (improve formatting? actually keep all defaults!) 
    # [NB this is used in uu~>tt~ but not in gg>tt~ or e+e->mu+mu-, see issue #337]
    @staticmethod
    def coeff(ff_number, frac, is_imaginary, Nc_power, Nc_value=3):
        """Returns a nicely formatted string for the coefficients in JAMP lines"""
        # Exact rational arithmetic via Fraction: total = ff_number * frac * Nc_value**Nc_power
        total_coeff = ff_number * frac * Fraction(Nc_value) ** Nc_power
        if total_coeff == 1:
            if is_imaginary:
                return '+cxtype(0,1)*' # AV keep default (this is not used in eemumu - should use cI eventually)
            else:
                return '+' # AV keep default (this is not used in eemumu)
        elif total_coeff == -1:
            if is_imaginary:
                return '-cxtype(0,1)*' # AV keep default (this is not used in eemumu - should use cI eventually)
            else:
                return '-' # AV keep default (eg jamp_sv[0] += -amp_sv[0])
        ###assert(False) # [this had been inserted to check if coeff is used at all, it is used in uu~>tt~, see #337]
        # General case: emit '<sign><num>.[/<den>.][*cxtype(0,1)]*'
        res_str = '%+i.' % total_coeff.numerator
        if total_coeff.denominator != 1:
            # Check if total_coeff is an integer
            res_str = res_str + '/%i.' % total_coeff.denominator
        if is_imaginary:
            res_str = res_str + '*cxtype(0,1)'
        return res_str + '*' # AV keep default (this is not used in eemumu)

    # AV - replace the export_cpp.OneProcessExporterCPP method (fix fptype and improve formatting)
    def get_color_matrix_lines(self, matrix_element):
        """Return the color matrix definition lines for this matrix element. Split rows in chunks of size n."""
        import madgraph.core.color_algebra as color
        if not matrix_element.get('color_matrix'):
            # Trivial 1x1 color structure when no color matrix is attached
            return '\n'.join([' static constexpr fptype2 colorDenom[1] = {1.};', 'static const fptype2 cf[1][1] = {1.};'])
        else:
            color_denominators = matrix_element.get('color_matrix').\
                                 get_line_denominators()
            denom_string = ' static constexpr fptype2 colorDenom[ncolor] = { %s }; // 1-D array[%i]' \
                           % ( ', '.join(['%i' % denom for denom in color_denominators]), len(color_denominators) )
            matrix_strings = []
            my_cs = color.ColorString() # NOTE(review): appears unused in this method - confirm before removing
            for index, denominator in enumerate(color_denominators):
                # Then write the numerators for the matrix elements
                num_list = matrix_element.get('color_matrix').get_line_numerators(index, denominator)
                matrix_strings.append('{ %s }' % ', '.join(['%d' % i for i in num_list]))
            matrix_string = ' static constexpr fptype2 colorMatrix[ncolor][ncolor] = '
            # One row per line when there is more than one color flow
            if len( matrix_strings ) > 1 : matrix_string += '{\n ' + ',\n '.join(matrix_strings) + ' };'
            else: matrix_string += '{ ' + matrix_strings[0] + ' };'
            matrix_string += ' // 2-D array[%i][%i]' % ( len(color_denominators), len(color_denominators) )
            denom_comment = '\n // The color denominators (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators)
            matrix_comment = '\n // The color matrix (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators)
            denom_string = denom_comment + denom_string
            matrix_string = matrix_comment + matrix_string
            return '\n'.join([denom_string, matrix_string])

    # AV - replace the export_cpp.OneProcessExporterGPU method (improve formatting)
    def get_initProc_lines(self, matrix_element, color_amplitudes):
        """Get initProc_lines for function definition for CPPProcess::initProc"""
        initProc_lines = []
        initProc_lines.append('// Set external particle masses for this matrix element')
        for part in matrix_element.get_external_wavefunctions():
            ###initProc_lines.append('mME.push_back(pars->%s);' % part.get('mass'))
            initProc_lines.append(' m_masses.push_back( m_pars->%s );' % part.get('mass')) # AV
        ###for i, colamp in enumerate(color_amplitudes):
        ###    initProc_lines.append('jamp2_sv[%d] = new double[%d];' % (i, len(colamp))) # AV - this was commented out already
        return '\n'.join(initProc_lines)

    # AV - replace the export_cpp.OneProcessExporterCPP method (fix helicity order and improve formatting)
    def get_helicity_matrix(self, matrix_element):
        """Return the Helicity matrix definition lines for this matrix element"""
        helicity_line = ' static constexpr short helicities[ncomb][npar] = {\n '; # AV (this is tHel)
        helicity_line_list = []
        for helicities in matrix_element.get_helicity_matrix(allow_reverse=True): # AV was False: different order in Fortran and cudacpp! #569
            helicity_line_list.append( '{ ' + ', '.join(['%d'] * len(helicities)) % tuple(helicities) + ' }' ) # AV
        return helicity_line + ',\n '.join(helicity_line_list) + ' };' # AV

    def get_flavor_matrix(self, matrix_element):
        """Return the flavor matrix definition lines for this matrix element"""
        flavor_line = ' static constexpr short flavors[nmaxflavor][npar] = {\n '; # (this is tFlavors)
        flavor_line_list = []
        for flavors in matrix_element.get_external_flavors_with_iden():
            # get only the index 0 one because the other ones have same matrix element
            # additionally they will be used as indices in some cases (e.g. matrix flavor couplings)
            # so we need to subtract 1 because FORTRAN indices starts from 1, and C++ from zero
            cpp_flavors = list(map(lambda f: f-1, flavors[0]))
            flavor_line_list.append( '{ ' + ', '.join(['%d'] * len(cpp_flavors)) % tuple(cpp_flavors) + ' }' )
        return flavor_line + ',\n '.join(flavor_line_list) + ' };'

    # AV - overload the export_cpp.OneProcessExporterGPU method (just to add some comments...)
    def get_reset_jamp_lines(self, color_amplitudes):
        """Get lines to reset jamps"""
        ret_lines = super().get_reset_jamp_lines(color_amplitudes)
        if ret_lines != '' : ret_lines = ' // Reset jamp (reset color flows)\n' + ret_lines # AV THIS SHOULD NEVER HAPPEN!
        return ret_lines


#------------------------------------------------------------------------------------

import madgraph.core.helas_objects as helas_objects
import madgraph.iolibs.helas_call_writers as helas_call_writers

# AV - define a custom HelasCallWriter
# (NB: enable this via PLUGIN_ProcessExporter.helas_exporter in output.py - this fixes #341)
class PLUGIN_GPUFOHelasCallWriter(helas_call_writers.GPUFOHelasCallWriter):
    """ A Custom HelasCallWriter """
    # Class structure information
    #  - object
    #  - dict(object) [built-in]
    #  - PhysicsObject(dict) [in madgraph/core/base_objects.py]
    #  - HelasCallWriter(base_objects.PhysicsObject) [in madgraph/iolibs/helas_call_writers.py]
    #  - UFOHelasCallWriter(HelasCallWriter) [in madgraph/iolibs/helas_call_writers.py]
    #  - CPPUFOHelasCallWriter(UFOHelasCallWriter) [in madgraph/iolibs/helas_call_writers.py]
    #  - GPUFOHelasCallWriter(CPPUFOHelasCallWriter) [in madgraph/iolibs/helas_call_writers.py]
    #  - PLUGIN_GPUFOHelasCallWriter(GPUFOHelasCallWriter)
    #      This class

    def __init__(self, *args, **opts):
        # Bookkeeping of coupling ordering accumulated while formatting calls:
        # aS-dependent couplings, aS-independent couplings, and flavor couplings.
        self.wanted_ordered_dep_couplings = []
        self.wanted_ordered_indep_couplings = []
        self.wanted_ordered_flv_couplings = []
        # Map from flavor-coupling name to its coupling object (filled in generate_helas_call)
        self.flv_couplings_map = {}
        super().__init__(*args,**opts)

    # AV - replace
helas_call_writers.GPUFOHelasCallWriter method (improve formatting of CPPProcess.cc)
    # [GPUFOHelasCallWriter.format_coupling is called by GPUFOHelasCallWriter.get_external_line/generate_helas_call]
    # [GPUFOHelasCallWriter.get_external_line is called by GPUFOHelasCallWriter.get_external]
    # [GPUFOHelasCallWriter.get_external (adding #ifdef CUDA) is called by GPUFOHelasCallWriter.generate_helas_call]
    # [GPUFOHelasCallWriter.generate_helas_call is called by UFOHelasCallWriter.get_wavefunction_call/get_amplitude_call]
    ###findcoupling = re.compile('pars->([-]*[\d\w_]+)\s*,')
    def format_coupling(self, call):
        """Format the coupling so any minus signs are put in front"""
        # NOTE(review): self.findcoupling is used below but the local regex above is
        # commented out - presumably inherited from the base class; confirm.
        import re
        ###print(call) # FOR DEBUGGING
        model = self.get('model')
        newcoup = False
        # Lazily initialize the ordering dictionaries on first use
        if not hasattr(self, 'couplings2order'):
            self.couplings2order = {}
            self.params2order = {}
        if not hasattr(self, 'couporderdep'):
            self.couporderdep = {}
            self.couporderindep = {}
            self.couporderflv = {}
        for coup in re.findall(self.findcoupling, call):
            if coup == 'ZERO':
                ###call = call.replace('pars->ZERO', '0.')
                call = call.replace('m_pars->ZERO', '0.') # AV
                continue
            # Strip a leading minus sign; it is re-applied in the replacement below
            sign = ''
            if coup.startswith('-'):
                sign = '-'
                coup = coup[1:]
            try:
                param = model.get_parameter(coup)
            except KeyError:
                param = False
            # Classify the symbol: model parameter (cIPD), aS-dependent coupling
            # (COUPD), flavor coupling (flvCOUP) or aS-independent coupling (COUPI)
            if param:
                alias = self.params2order
                aliastxt = 'PARAM'
                name = 'cIPD'
            elif model.is_running_coupling(coup):
                if coup not in self.wanted_ordered_dep_couplings:
                    self.wanted_ordered_dep_couplings.append(coup)
                alias = self.couporderdep
                aliastxt = 'COUPD'
                name = 'cIPC'
            elif coup.startswith("FLV"):
                if coup not in [coup.name for coup in self.wanted_ordered_flv_couplings]:
                    flv_coup = self.flv_couplings_map[coup]
                    self.wanted_ordered_flv_couplings.append(flv_coup)
                    # Also register the independent couplings this flavor coupling refers to
                    for indep_coup in set(flv_coup.flavors.values()):
                        if indep_coup not in self.wanted_ordered_indep_couplings:
                            self.wanted_ordered_indep_couplings.append(indep_coup)
                alias = self.couporderflv
                aliastxt = 'flvCOUP'
                name = 'flvCOUPs'
            else:
                if coup not in self.wanted_ordered_indep_couplings:
                    self.wanted_ordered_indep_couplings.append(coup)
                alias = self.couporderindep
                aliastxt = 'COUPI'
                name = 'cIPC'
            if coup not in alias:
                # Independent couplings are indexed after all dependent ones; compare
                # by aliastxt (not by dict identity) to avoid bug #821 on empty dicts
                ###if alias == self.couporderindep: # bug #821! this is incorrectly true when both dictionaries are empty!
                if aliastxt == 'COUPI':
                    if not len(alias):
                        alias[coup] = len(self.couporderdep)
                    else:
                        alias[coup] = alias[list(alias)[-1]]+1
                else:
                    alias[coup] = len(alias) # this works perfectly also for FLV couplings
                # A new dependent coupling shifts all independent coupling indices by one
                ###if alias == self.couporderdep: # bug #821! this is incorrectly true when both dictionaries are empty!
                if aliastxt == 'COUPD':
                    for k in self.couporderindep:
                        self.couporderindep[k] += 1
                newcoup = True
            # Rewrite the 'm_pars->...' reference in the call with the indexed array access
            if name == 'cIPD':
                call = call.replace('m_pars->%s%s' % (sign, coup),
                                    '%s%s[%s]' % (sign, name, alias[coup]))
            elif model.is_running_coupling(coup):
                ###call = call.replace('m_pars->%s%s' % (sign, coup),
                ###                    '%scxmake( cIPC[%s], cIPC[%s] )' %
                ###                    (sign, 2*alias[coup],2*alias[coup]+1))
                ###misc.sprint(name, alias[coup])
                # AV from cIPCs to COUP array (running alphas #373)
                # OM fix handling of 'unary minus' #628
                call = call.replace('CI_ACCESS', 'CD_ACCESS')
                call = call.replace('m_pars->%s%s' % (sign, coup),
                                    'COUPs[%s], %s' % (alias[coup], '1.0' if not sign else '-1.0'))
            elif name == 'flvCOUPs':
                call = call.replace('CD_ACCESS', 'CI_ACCESS')
                call = call.replace('m_pars->%s%s' % (sign, coup),
                                    '%s[%s], %s' % (name, alias[coup], '1.0' if not sign else '-1.0'))
            else:
                call = call.replace('CD_ACCESS', 'CI_ACCESS')
                call = call.replace('m_pars->%s%s' % (sign, coup),
                                    'COUPs[ndcoup + %s], %s' % (alias[coup]-len(self.couporderdep), '1.0' if not sign else '-1.0'))

        if newcoup:
            # Publish the merged ordering (dependent first, then independent) on the model
            self.couplings2order = self.couporderdep | self.couporderindep
            model.cudacpp_wanted_ordered_couplings = self.wanted_ordered_dep_couplings + self.wanted_ordered_indep_couplings + self.wanted_ordered_flv_couplings
        return call

    # AV - new method for formatting wavefunction/amplitude calls
    # [It would be too complex to modify them in helas_objects.HelasWavefunction/Amplitude.get_call_key]
    @staticmethod
    def format_call(call):
        # Insert spaces inside parentheses and after commas
        return call.replace('(','( ').replace(')',' )').replace(',',', ')

    # AV - replace helas_call_writers.GPUFOHelasCallWriter method (improve formatting)
    def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi_channel_map=False):
        """Return a list of strings, corresponding to the Helas calls for the matrix element"""
        import madgraph.core.helas_objects as helas_objects
        import madgraph.loop.loop_helas_objects as loop_helas_objects
        assert isinstance(matrix_element, helas_objects.HelasMatrixElement), \
               '%s not valid argument for get_matrix_element_calls' % \
               type(matrix_element)
        # Do not reuse the wavefunctions for loop matrix elements
        if isinstance(matrix_element, loop_helas_objects.LoopHelasMatrixElement):
            return self.get_loop_matrix_element_calls(matrix_element)
        # Restructure data for easier handling: color[amp_number][jamp_number] = coeff
        color = {}
        for njamp, coeff_list in enumerate(color_amplitudes):
            for coeff, namp in coeff_list:
                if namp not in color:
                    color[namp] = {}
                color[namp][njamp] = coeff
        me = matrix_element.get('diagrams')
        matrix_element.reuse_outdated_wavefunctions(me)
        ###misc.sprint(multi_channel_map)
        res = []
        ###res.append('for(int i=0;i<%s;i++){jamp[i] = cxtype(0.,0.);}' % len(color_amplitudes))
        # Prologue of the generated C++ calculate_wavefunctions body.
        # NOTE(review): some literals below look truncated by this patch view
        # (e.g. 'nv_diag_suppress 186 // e.g. <>' and 'reinterpret_cast(...)'
        # with no template argument) - verify against the upstream template.
        res.append("""//constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
    constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
    const fptype* allCOUPs[nxcoup];
#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
#pragma nv_diagnostic push
#pragma nv_diag_suppress 186 // e.g. <>
#endif
    for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
      allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
    //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
      allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
#ifdef MGONGPUCPP_GPUIMPL
#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
#pragma nv_diagnostic pop
#endif
    // CUDA kernels take input/output buffers with momenta/MEs for all events
    const fptype* momenta = allmomenta;
    const fptype* COUPs[nxcoup];
    for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
    fptype* denominators = allDenominators;
#endif
#else
    // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
    const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
    const fptype* COUPs[nxcoup];
    for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
      COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
    //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
      COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
    fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
#endif
#endif
    // Create an array of views over the Flavor Couplings
    FLV_COUPLING_VIEW flvCOUPs[nIPF];
    for ( int idflv = 0; idflv < nIPF; idflv++ )
      flvCOUPs[idflv] = FLV_COUPLING_VIEW{ cIPF_partner1, cIPF_partner2, cIPF_value, idflv * nMF };

    // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
    for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
    fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
#endif
    // Scalar iflavor for the current event
    // for GPU it is an int
    // for SIMD it is also an int, since it is constant across the SIMD vector
    const uint_sv iflavor_sv = F_ACCESS::kernelAccessConst( iflavorVec );
#ifdef MGONGPUCPP_GPUIMPL
    const unsigned int iflavor = iflavor_sv;
#else
    const unsigned int iflavor = reinterpret_cast(&iflavor_sv)[0];
#endif
""")
        diagrams = matrix_element.get('diagrams')
        # Map first amplitude number of each config to its config id (for multichannel)
        diag_to_config = {}
        if multi_channel_map:
            for config in sorted(multi_channel_map.keys()):
                amp = [a.get('number') for a in \
                       sum([diagrams[idiag].get('amplitudes') for \
                            idiag in multi_channel_map[config]], [])]
                diag_to_config[amp[0]] = config
        ###misc.sprint(diag_to_config)
        id_amp = 0
        for diagram in matrix_element.get('diagrams'):
            ###print('DIAGRAM %3d: #wavefunctions=%3d, #diagrams=%3d' %
            ###      (diagram.get('number'), len(diagram.get('wavefunctions')), len(diagram.get('amplitudes')) )) # AV - FOR DEBUGGING
            res.append('\n // *** DIAGRAM %d OF %d ***' % (diagram.get('number'), len(matrix_element.get('diagrams'))) ) # AV
            res.append('\n // Wavefunction(s) for diagram number %d' % diagram.get('number')) # AV
            res.extend([ self.get_wavefunction_call(wf) for wf in diagram.get('wavefunctions') ]) # AV new: avoid format_call
            if len(diagram.get('wavefunctions')) == 0 : res.append('// (none)') # AV
            # Drop a trailing newline left by the last wavefunction call
            if res[-1][-1] == '\n' : res[-1] = res[-1][:-1]
            res.append('\n // Amplitude(s) for diagram number %d' % diagram.get('number'))
            for amplitude in diagram.get('amplitudes'):
                id_amp +=1
                namp = amplitude.get('number')
                amplitude.set('number', 1)
                res.append(self.get_amplitude_call(amplitude)) # AV new: avoid format_call
                if multi_channel_map: # different code bases #473 (assume this is the same as self.include_multi_channel...)
                    if id_amp in diag_to_config:
                        ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diag_to_config[id_amp]) # BUG #472
                        ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472
                        res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL")
                        diagnum = diagram.get('number')
                        res.append("if( storeChannelWeights )")
                        res.append("{")
                        res.append(" numerators_sv[%i] += cxabs2( amp_sv[0] );" % (diagnum-1))
                        res.append(" denominators_sv += cxabs2( amp_sv[0] );")
                        res.append("}")
                        res.append("#endif")
                    else:
                        res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL")
                        res.append("// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)")
                        res.append("#endif")
                # Emit the jamp_sv accumulation lines for this amplitude
                for njamp, coeff in color[namp].items():
                    scoeff = PLUGIN_OneProcessExporter.coeff(*coeff) # AV
                    if scoeff[0] == '+' : scoeff = scoeff[1:]
                    scoeff = scoeff.replace('(','( ')
                    scoeff = scoeff.replace(')',' )')
                    scoeff = scoeff.replace(',',', ')
                    scoeff = scoeff.replace('*',' * ')
                    scoeff = scoeff.replace('/',' / ')
                    if scoeff.startswith('-'): res.append('jamp_sv[%s] -= %samp_sv[0];' % (njamp, scoeff[1:])) # AV
                    else: res.append('jamp_sv[%s] += %samp_sv[0];' % (njamp, scoeff)) # AV
            if len(diagram.get('amplitudes')) == 0 : res.append('// (none)') # AV
        ###res.append('\n // *** END OF DIAGRAMS ***' ) # AV - no longer needed ('COLOR MATRIX BELOW')
        return res

    # AV - overload helas_call_writers.GPUFOHelasCallWriter method (improve formatting)
    def get_matrix_element_calls(self, matrix_element, color_amplitudes, multi_channel_map=False):
        """Return a list of strings, corresponding to the Helas calls for the matrix element"""
        res = self.super_get_matrix_element_calls(matrix_element, color_amplitudes, multi_channel_map)
        for i, item in enumerate(res):
            ###print(item) # FOR DEBUGGING
            if item.startswith('# Amplitude'): item='//'+item[1:] # AV replace '# Amplitude' by '// Amplitude'
            # Indent everything that is not a preprocessor line or a blank-prefixed line
            if not item.startswith('\n') and not item.startswith('#'): res[i]=' '+item
        return res

    # AV - replace helas_call_writers.GPUFOHelasCallWriter method (improve formatting)
    # [GPUFOHelasCallWriter.format_coupling is called by GPUFOHelasCallWriter.get_external_line/generate_helas_call]
    # [GPUFOHelasCallWriter.get_external_line is called by GPUFOHelasCallWriter.get_external]
    # [=> GPUFOHelasCallWriter.get_external is called by GPUFOHelasCallWriter.generate_helas_call]
    # [GPUFOHelasCallWriter.generate_helas_call is called by UFOHelasCallWriter.get_wavefunction_call/get_amplitude_call]
    first_get_external = True # class-level flag: only the first xxx call gets the TEST_DIVERGENCE wrapper
    def get_external(self, wf, argument):
        """Return the formatted external-wavefunction call line (reformatting get_external_line output)."""
        line = self.get_external_line(wf, argument)
        split_line = line.split(',')
        split_line = [ str.lstrip(' ').rstrip(' ') for str in split_line] # AV
        # (AV join using ',': no need to add a space as this is done by format_call later on)
        line = ', '.join(split_line)
        line = line.replace( 'xxx(', 'xxx(' ) # NOTE(review): this replace is a no-op as written - confirm intent
        line = line.replace( 'w_sv', 'w_fp' )
        # AV2: line2 logic is to have MGONGPU_TEST_DIVERGENCE on the first xxx call
        if self.first_get_external and ( ( 'mzxxx' in line ) or ( 'pzxxx' in line ) or ( 'xzxxx' in line ) ) :
            self.first_get_external = False
            # line2 is the generic xxxxx variant with an explicit fmass=0 argument
            line2 = line.replace('mzxxx','xxxxx').replace('pzxxx','xxxxx').replace('xzxxx','xxxxx')
            line2 = line2[:line2.find('// NB')]
            split_line2 = line2.split(',')
            split_line2 = [ str.lstrip(' ').rstrip(' ') for str in split_line2] # AV
            split_line2.insert(2, '0') # add parameter fmass=0
            line2 = ', '.join(split_line2)
            text = '#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV
            return text % (line, line, line2)
        text = '%s\n' # AV
        return text % line

    # AV - replace helas_call_writers.GPUFOHelasCallWriter method (vectorize w_sv)
    # This is the method that creates the ixxx/oxxx function calls in calculate_wavefunctions
    # [GPUFOHelasCallWriter.get_external_line is called by GPUFOHelasCallWriter.get_external]
    # [GPUFOHelasCallWriter.get_external (adding #ifdef CUDA) is called by GPUFOHelasCallWriter.generate_helas_call]
    # [GPUFOHelasCallWriter.generate_helas_call is called by UFOHelasCallWriter.get_wavefunction_call/get_amplitude_call]
    def get_external_line(self, wf, argument):
        """Build the raw ixxx/oxxx/vxxx/sxxx call string for one external wavefunction."""
        call = ''
        call = call + helas_call_writers.HelasCallWriter.mother_dict[\
            argument.get_spin_state_number()].lower()
        # Fill out with X up to 6 positions
        call = call + 'x' * (6 - len(call))
        # Specify namespace for Helas calls
        call = call + '( momenta,'
        if argument.get('spin') != 1:
            # For non-scalars, need mass and helicity
            call = call + 'm_pars->%s, cHel[ihel][%d],'
        else:
            # AV This seems to be for scalars (spin==1???), pass neither mass nor helicity (#351)
            ###call = call + 'm_pars->%s,'
            call = call
        # Add flavor and the related ALOHA object
        call = call + '%+d, cFlavors[iflavor][%d], aloha_obj[%d], %d );'
        if argument.get('spin') == 1:
            # AV This seems to be for scalars (spin==1???), pass neither mass nor helicity (#351)
            return call % \
                            (
                             ###wf.get('mass'),
                             # For boson, need initial/final here
                             (-1) ** (wf.get('state') == 'initial'),
                             wf.get('me_id')-1,
                             wf.get('number_external')-1,
                             wf.get('number_external')-1)
        elif argument.is_boson():
            ###misc.sprint(call)
            ###misc.sprint( (wf.get('mass'),
            ###              wf.get('number_external')-1,
            ###              # For boson, need initial/final here
            ###              (-1) ** (wf.get('state') == 'initial'),
            ###              wf.get('me_id')-1,
            ###              wf.get('number_external')-1))
            return self.format_coupling(call % \
                            (wf.get('mass'),
                             wf.get('number_external')-1,
                             # For boson, need initial/final here
                             (-1) ** (wf.get('state') == 'initial'),
                             wf.get('number_external')-1,
                             wf.get('me_id')-1,
                             wf.get('number_external')-1))
        else:
            return self.format_coupling(call % \
                            (wf.get('mass'),
                             wf.get('number_external')-1,
                             # For fermions, need particle/antiparticle
                             - (-1) ** wf.get_with_flow('is_part'),
                             wf.get('number_external')-1,
                             wf.get('me_id')-1,
                             wf.get('number_external')-1))

    # AV - replace helas_call_writers.GPUFOHelasCallWriter method (vectorize w_sv and amp_sv)
    def generate_helas_call(self, argument):
        """Routine for automatic generation of C++ Helas calls
        according to just the spin structure of the interaction.

        First the call string is generated, using a dictionary to go
        from the spin state of the calling wavefunction and its
        mothers, or the mothers of the amplitude, to differentiate which call is
        done.

        Then the call function is generated, as a lambda which fills
        the call string with the information of the calling
        wavefunction or amplitude. The call has different structure,
        depending on the spin of the wavefunction and the number of
        mothers (multiplicity of the vertex). The mother
        wavefunctions, when entering the call, must be sorted in the
        correct way - this is done by the sorted_mothers routine.

        Finally the call function is stored in the relevant
        dictionary, in order to be able to reuse the function the next
        time a wavefunction with the same Lorentz structure is needed.
        """
        if not isinstance(argument, helas_objects.HelasWavefunction) and \
           not isinstance(argument, helas_objects.HelasAmplitude):
            raise self.PhysicsObjectError('get_helas_call must be called with wavefunction or amplitude')
        call = ''
        call_function = None
        if isinstance(argument, helas_objects.HelasAmplitude) and \
           argument.get('interaction_id') == 0:
            # Amplitudes with no interaction id are emitted as a comment line
            call = '#'
            call_function = lambda amp: call
            self.add_amplitude(argument.get_call_key(), call_function)
            return
        if isinstance(argument, helas_objects.HelasWavefunction) and \
           not argument.get('mothers'):
            # String is just ixxxxx, oxxxxx, vxxxxx or sxxxxx
            call_function = lambda wf: self.get_external(wf, argument)
        else:
            if isinstance(argument, helas_objects.HelasWavefunction):
                outgoing = argument.find_outgoing_number()
            else:
                outgoing = 0
            # Check if we need to append a charge conjugation flag
            l = [str(l) for l in argument.get('lorentz')]
            flag = []
            if argument.needs_hermitian_conjugate():
                flag = ['C%d' % i for i in argument.get_conjugate_index()]
            # Creating line formatting:
            # (AV NB: in the default code these two branches were identical, use a single branch)
            ###if isinstance(argument, helas_objects.HelasWavefunction): # AV e.g. FFV1P0_3 (output is wavefunction)
            ###    call = '%(routine_name)s(%(wf)s%(coup)s%(mass)s%(out)s);'
            ###else: # AV e.g. FFV1_0 (output is amplitude)
            ###    call = '%(routine_name)s(%(wf)s%(coup)s%(mass)s%(out)s);'
            call = '%(routine_name)s( %(wf)s%(coup)s%(mass)s%(out)s );'
            # compute wf
            arg = {'routine_name': aloha_writers.combine_name('%s' % l[0], l[1:], outgoing, flag, True),
                   'wf': ('aloha_obj[%%(%d)d], ' * len(argument.get('mothers'))) % tuple(range(len(argument.get('mothers')))),
                   'coup': ('m_pars->%%(coup%d)s, ' * len(argument.get('coupling'))) % tuple(range(len(argument.get('coupling'))))
                   }
            # AV FOR PR #434: determine if this call needs aS-dependent or aS-independent parameters
            usesdepcoupl = None
            for coup in argument.get('coupling'):
                if isinstance(coup, base_objects.FLV_Coupling):
                    # Flavor couplings count as aS-independent; remember the object for format_coupling
                    if usesdepcoupl is None: usesdepcoupl = False
                    elif usesdepcoupl: raise Exception('PANIC! this call seems to use both aS-dependent and aS-independent couplings?')
                    self.flv_couplings_map[coup.name] = coup
                    continue
                if coup.startswith('-'):
                    coup = coup[1:]
                # Use the same implementation as in UFOModelConverterCPP.prepare_couplings (assume self.model is the same)
                for key, coup_list in self.get('model')['couplings'].items():
                    if coup in coup_list:
                        if "aS" in key:
                            if usesdepcoupl is None: usesdepcoupl = True
                            elif not usesdepcoupl: raise Exception('PANIC! this call seems to use both aS-dependent and aS-independent couplings?')
                        else:
                            if usesdepcoupl is None: usesdepcoupl = False
                            elif usesdepcoupl: raise Exception('PANIC! this call seems to use both aS-dependent and aS-independent couplings?')
            # AV FOR PR #434: CI_ACCESS for independent couplings and CD_ACCESS for dependent couplings
            if usesdepcoupl is None: raise Exception('PANIC! could not determine if this call uses aS-dependent or aS-independent couplings?')
            elif usesdepcoupl: caccess = 'CD_ACCESS'
            else: caccess = 'CI_ACCESS'
            ###if arg['routine_name'].endswith( '_0' ) : arg['routine_name'] += ''
            ###else : arg['routine_name'] += ''
            # NOTE(review): the appended template suffix looks truncated in this patch view
            # (''%caccess appends nothing) - verify the '<...>' template arguments upstream.
            if arg['routine_name'].endswith( '_0' ) : arg['routine_name'] += ''%caccess
            else : arg['routine_name'] += ''%caccess
            if isinstance(argument, helas_objects.HelasWavefunction):
                #arg['out'] = 'w_sv[%(out)d]'
                arg['out'] = 'aloha_obj[%(out)d]'
                if aloha.complex_mass:
                    arg['mass'] = 'm_pars->%(CM)s, '
                else:
                    arg['mass'] = 'm_pars->%(M)s, m_pars->%(W)s, '
            else:
                #arg['out'] = '&_sv[%(out)d]'
                arg['out'] = '&_fp[%(out)d]'
                arg['out2'] = 'amp_sv[%(out)d]'
                arg['mass'] = ''
            call = call % arg
            # Now we have a line correctly formatted
            call_function = lambda wf: self.format_coupling(
                                call % wf.get_helas_call_dict(index=0))
        # Add the constructed function to wavefunction or amplitude dictionary
        if isinstance(argument, helas_objects.HelasWavefunction):
            self.add_wavefunction(argument.get_call_key(), call_function)
        else:
            self.add_amplitude(argument.get_call_key(), call_function)

#------------------------------------------------------------------------------------
diff --git a/PLUGIN/CUDACPP_OUTPUT/output.py b/PLUGIN/CUDACPP_OUTPUT/output.py
new file mode 100644
index 0000000000..1f76172cda
--- /dev/null
+++ b/PLUGIN/CUDACPP_OUTPUT/output.py
@@ -0,0 +1,528 @@
# Copyright (C) 2020-2025 CERN and UCLouvain.
# Licensed under the GNU Lesser General Public License (version 3 or later).
# Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2024) for the MG5aMC CUDACPP plugin.
+ +import os +import sys +import subprocess + +# AV - PLUGIN_NAME can be one of PLUGIN/CUDACPP_OUTPUT or MG5aMC_PLUGIN/CUDACPP_OUTPUT +PLUGIN_NAME = __name__.rsplit('.',1)[0] + +# AV - load an independent 2nd copy of the export_cpp module (as PLUGIN_export_cpp) and use that within the plugin (workaround for #341) +# See https://stackoverflow.com/a/11285504 +###import madgraph.iolibs.export_cpp as export_cpp # 1st copy +######import madgraph.iolibs.export_cpp as PLUGIN_export_cpp # this is not enough to define an independent 2nd copy: id(export_cpp)==id(PLUGIN_export_cpp) +import importlib.util +SPEC_EXPORTCPP = importlib.util.find_spec('madgraph.iolibs.export_cpp') +PLUGIN_export_cpp = importlib.util.module_from_spec(SPEC_EXPORTCPP) +SPEC_EXPORTCPP.loader.exec_module(PLUGIN_export_cpp) +###sys.modules['PLUGIN.CUDACPP_OUTPUT.PLUGIN_export_cpp'] = PLUGIN_export_cpp # allow 'import PLUGIN.CUDACPP_OUTPUT.PLUGIN_export_cpp' in model_handling.py +sys.modules['%s.PLUGIN_export_cpp'%PLUGIN_NAME] = PLUGIN_export_cpp # allow 'import .PLUGIN_export_cpp' in model_handling.py +del SPEC_EXPORTCPP +###print('id(export_cpp)=%s'%id(export_cpp)) +###print('id(PLUGIN_export_cpp)=%s'%id(PLUGIN_export_cpp)) + +# AV - use template files from PLUGINDIR instead of MG5DIR +###from madgraph import MG5DIR +PLUGINDIR = os.path.dirname( __file__ ) + +# AV - model_handling includes the custom FileWriter, ALOHAWriter, UFOModelConverter, OneProcessExporter and HelasCallWriter, plus additional patches +###import PLUGIN.CUDACPP_OUTPUT.model_handling as model_handling # AV modify this to also allow MG5aMC_PLUGIN +__import__('%s.model_handling'%PLUGIN_NAME) +model_handling = sys.modules['%s.model_handling'%PLUGIN_NAME] + +# AV - create a plugin-specific logger +import logging +logger = logging.getLogger('madgraph.%s.output'%PLUGIN_NAME) +from madgraph import MG5DIR +#------------------------------------------------------------------------------------ + +from os.path import join as pjoin +import 
madgraph.iolibs.files as files +import madgraph.iolibs.export_v4 as export_v4 +import madgraph.various.misc as misc + +from . import launch_plugin + + +# AV - define the plugin's process exporter +# (NB: this is the plugin's main class, enabled in the new_output dictionary in __init__.py) +class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterCPP): + # Class structure information + # - object + # - VirtualExporter(object) [in madgraph/iolibs/export_v4.py] + # - ProcessExporterCPP(VirtualExporter) [in madgraph/iolibs/export_cpp.py] + # - PLUGIN_ProcessExporter(ProcessExporterCPP) + # This class + + # Below are the class variable that are defined in export_v4.VirtualExporter + # AV - keep defaults from export_v4.VirtualExporter + # Check status of the directory. Remove it if already exists + ###check = True + # Output type: [Template/dir/None] copy the Template (via copy_template), just create dir or do nothing + ###output = 'Template' + + # If sa_symmetry is true, generate fewer matrix elements + # AV - keep OM's default for this plugin (using grouped_mode=False, "can decide to merge uu~ and u~u anyway") + sa_symmetry = True + + # Below are the class variable that are defined in export_cpp.ProcessExporterGPU + # AV - keep defaults from export_cpp.ProcessExporterGPU + # Decide which type of merging is used [madevent/madweight] + grouped_mode = False + # Other options + default_opt = {'clean': False, 'complex_mass':False, 'export_format':'madevent', 'mp': False, 'v5_model': True } + + # AV - keep defaults from export_cpp.ProcessExporterGPU + # AV - used in MadGraphCmd.do_output to assign export_cpp.ExportCPPFactory to MadGraphCmd._curr_exporter (if cpp or gpu) + # AV - used in MadGraphCmd.export to assign helas_call_writers.(CPPUFO|GPUFO)HelasCallWriter to MadGraphCmd._curr_helas_model (if cpp or gpu) + # Language type: 'v4' for f77, 'cpp' for C++ output + exporter = 'gpu' + + # AV - use a custom OneProcessExporter + ###oneprocessclass = 
PLUGIN_export_cpp.OneProcessExporterGPU # responsible for P directory + oneprocessclass = model_handling.PLUGIN_OneProcessExporter + + # Information to find the template file that we want to include from madgraph + # you can include additional file from the plugin directory as well + # AV - use template files from PLUGINDIR instead of MG5DIR and add gpu/mgOnGpuVectors.h + # [NB: mgOnGpuConfig.h, check_sa.cc and fcheck_sa.f are handled through dedicated methods] + ###s = MG5DIR + '/madgraph/iolibs/template_files/' + s = PLUGINDIR + '/madgraph/iolibs/template_files/' + from_template = {'.': [s+'.clang-format', s+'CMake/CMakeLists.txt', + s+'COPYRIGHT', s+'COPYING', s+'COPYING.LESSER' ], + 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], + 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', + s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', + s+'gpu/constexpr_math.h', + s+'gpu/cudacpp_config.mk', + s+'CMake/src/CMakeLists.txt' ], + 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', + s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', + s+'gpu/color_sum.h', + s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', + s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', + s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', + s+'gpu/MemoryAccessAmplitudes.h', s+'gpu/MemoryAccessWavefunctions.h', + s+'gpu/MemoryAccessGs.h', s+'gpu/MemoryAccessCouplingsFixed.h', + s+'gpu/MemoryAccessNumerators.h', s+'gpu/MemoryAccessDenominators.h', + s+'gpu/MemoryAccessChannelIds.h', s+'gpu/MemoryAccessIflavorVec.h', + s+'gpu/EventStatistics.h', s+'gpu/CommonRandomNumbers.h', + s+'gpu/CrossSectionKernels.cc', s+'gpu/CrossSectionKernels.h', + s+'gpu/MatrixElementKernels.cc', s+'gpu/MatrixElementKernels.h', + s+'gpu/RamboSamplingKernels.cc', s+'gpu/RamboSamplingKernels.h', + s+'gpu/RandomNumberKernels.h', s+'gpu/CommonRandomNumberKernel.cc', + 
s+'gpu/CurandRandomNumberKernel.cc', s+'gpu/HiprandRandomNumberKernel.cc', + s+'gpu/Bridge.h', s+'gpu/BridgeKernels.cc', s+'gpu/BridgeKernels.h', + s+'gpu/fbridge.cc', s+'gpu/fbridge.h', s+'gpu/fbridge.inc', s+'gpu/fsampler.cc', s+'gpu/fsampler.inc', + s+'gpu/MadgraphTest.h', s+'gpu/runTest.cc', + s+'gpu/testmisc.cc', s+'gpu/testxxx_cc_ref.txt', s+'gpu/valgrind.h', + s+'gpu/perf.py', s+'gpu/profile.sh', + s+'gpu/cudacpp_overlay.mk', s+'gpu/makefile_wrapper.mk', + s+'gpu/umami.h', s+'gpu/umami.cc', + s+'CMake/SubProcesses/CMakeLists.txt'], + 'test': [s+'gpu/cudacpp_test.mk']} + + to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', + 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', + 'color_sum.h', + 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', + 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', + 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', + 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', + 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', + 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', + 'MemoryAccessChannelIds.h', 'MemoryAccessIflavorVec.h', + 'EventStatistics.h', 'CommonRandomNumbers.h', + 'CrossSectionKernels.cc', 'CrossSectionKernels.h', + 'MatrixElementKernels.cc', 'MatrixElementKernels.h', + 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', + 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', + 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', + 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', + 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', + 'MadgraphTest.h', 'runTest.cc', + 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', + 'cudacpp.mk', # this is generated from a template in Subprocesses but we still link it in P1 + 'cudacpp_overlay.mk', # this is generated from a template in Subprocesses but we still link it in P1 + 'testxxx.cc', # this is generated from a template in Subprocesses but we still link it in P1 + 'MemoryBuffers.h', # this is generated 
from a template in Subprocesses but we still link it in P1 + 'MemoryAccessCouplings.h', # this is generated from a template in Subprocesses but we still link it in P1 + 'umami.h', 'umami.cc', + 'perf.py', 'profile.sh'] + + # AV - use template files from PLUGINDIR instead of MG5DIR and change their names + ###template_src_make = pjoin(MG5DIR, 'madgraph' ,'iolibs', 'template_files','gpu','Makefile_src') + ###template_Sub_make = pjoin(MG5DIR, 'madgraph', 'iolibs', 'template_files','gpu','Makefile') + template_src_make = pjoin(PLUGINDIR, 'madgraph' ,'iolibs', 'template_files','gpu','cudacpp_src.mk') + template_Sub_make = pjoin(PLUGINDIR, 'madgraph', 'iolibs', 'template_files','gpu','cudacpp.mk') + template_tst_make = pjoin(PLUGINDIR, 'madgraph', 'iolibs', 'template_files','gpu','cudacpp_test.mk') + + # AV - use a custom UFOModelConverter (model/aloha exporter) + ###create_model_class = PLUGIN_export_cpp.UFOModelConverterGPU + create_model_class = model_handling.PLUGIN_UFOModelConverter + + # AV - use a custom GPUFOHelasCallWriter + # (NB: use "helas_exporter" - see class MadGraphCmd in madgraph_interface.py - not "aloha_exporter" that is never used!) + ###helas_exporter = None + helas_exporter = model_handling.PLUGIN_GPUFOHelasCallWriter # this is one of the main fixes for issue #341! 
+ + # AV (default from OM's tutorial) - add a debug printout + def __init__(self, *args, **kwargs): + self.in_madevent_mode = False # see MR #747 + misc.sprint('Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter)') + return super().__init__(*args, **kwargs) + + # AV - overload the default version: create CMake directory, do not create lib directory + def copy_template(self, model): + misc.sprint('Entering PLUGIN_ProcessExporter.copy_template (initialise the directory)') + try: os.mkdir(self.dir_path) + except os.error as error: logger.warning(error.strerror + ' ' + self.dir_path) + with misc.chdir(self.dir_path): + logger.info('Creating subdirectories in directory %s' % self.dir_path) + for d in ['src', 'Cards', 'SubProcesses', 'CMake', 'test', 'test/ref']: # AV - added CMake, test, test/ref; removed lib + try: os.mkdir(d) + except os.error as error: logger.warning(error.strerror + ' ' + os.path.join(self.dir_path,d)) + # Write param_card + open(os.path.join('Cards','param_card.dat'), 'w').write(model.write_param_card()) + # Copy files in various subdirectories + for key in self.from_template: + for f in self.from_template[key]: + PLUGIN_export_cpp.cp(f, key) # NB this assumes directory key exists... 
+ # Copy src makefile + if self.template_src_make: + makefile_src = self.read_template_file(self.template_src_make) % {'model': self.get_model_name(model.get('name'))} + open(os.path.join('src', 'cudacpp_src.mk'), 'w').write(makefile_src) + # Copy SubProcesses makefile + if self.template_Sub_make: + makefile = self.read_template_file(self.template_Sub_make) % {'model': self.get_model_name(model.get('name'))} + open(os.path.join('SubProcesses', 'cudacpp.mk'), 'w').write(makefile) + # Copy test makefile + if self.template_tst_make: + makefile_test = self.read_template_file(self.template_tst_make) % {'model': self.get_model_name(model.get('name'))} + open(os.path.join('test', 'cudacpp_test.mk'), 'w').write(makefile_test) + + # OM - overload export_v4.py version to add additional_clean section (and avoid patchMad.sh for Source/makefile) + def write_source_makefile(self, writer, model=None, default=None): + if default: + replace_dict = default + else: + raise Exception('primary exporter should have been run first') + path = pjoin(PLUGINDIR , 'madgraph', 'iolibs', 'template_files', 'madevent_makefile_source_addon') + replace_dict['additional_clean'] += open(path).read() + if writer: + path = pjoin(MG5DIR, 'madgraph', 'iolibs','template_files','madevent_makefile_source') + text = open(path).read() % replace_dict + writer.write(text) + + # AV - add debug printouts (in addition to the default one from OM's tutorial) + def generate_subprocess_directory(self, subproc_group, fortran_model, me=None): + misc.sprint('Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory)') + misc.sprint(' type(subproc_group)=%s'%type(subproc_group)) # e.g. madgraph.core.helas_objects.HelasMatrixElement + misc.sprint(' type(fortran_model)=%s'%type(fortran_model)) # e.g. madgraph.iolibs.helas_call_writers.GPUFOHelasCallWriter + misc.sprint(' type(me)=%s me=%s'%(type(me) if me is not None else None, me)) # e.g. 
int + misc.sprint("need to link", self.to_link_in_P) + out = super().generate_subprocess_directory(subproc_group, fortran_model, me) + return out + # AV (default from OM's tutorial) - add a debug printout + def convert_model(self, model, wanted_lorentz=[], wanted_couplings=[]): + if hasattr(model , 'cudacpp_wanted_ordered_couplings'): + wanted_couplings = model.cudacpp_wanted_ordered_couplings + del model.cudacpp_wanted_ordered_couplings + return super().convert_model(model, wanted_lorentz, wanted_couplings) + + # AV (default from OM's tutorial) - add a debug printout + def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): + """Typically creating jpeg/HTML output/ compilation/... + cmdhistory is the list of command used so far. + MG5options are all the options of the main interface + outputflags is a list of options provided when doing the output command""" + ###misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self)) + if self.in_madevent_mode: + # Modify makefiles and symlinks to avoid doing + # make -f makefile -f cudacpp_overlay.mk to include the overlay + # and instead just use `make`, see #1052 + subprocesses_dir = pjoin(self.dir_path, "SubProcesses") + files.cp(pjoin(subprocesses_dir, "makefile"), pjoin(subprocesses_dir, "makefile_original.mk")) + files.rm(pjoin(subprocesses_dir, "makefile")) + files.ln(pjoin(subprocesses_dir, "makefile_wrapper.mk"), subprocesses_dir, 'makefile') + + patch_coupl_write = r"""set -euo pipefail +# Get last fields from lines starting with WRITE(*,2) +gcs=$(awk '$1=="WRITE(*,2)" {print $NF}' coupl_write.inc) + +for gc in $gcs; do + if grep -q "$gc(VECSIZE_MEMMAX)" coupl.inc; then + awk -v gc="$gc" '{ + if ($1=="WRITE(*,2)" && $NF==gc) print $0"(1)"; + else print + }' coupl_write.inc > coupl_write.inc.new + mv coupl_write.inc.new coupl_write.inc + fi +done""" + try: + result = subprocess.run( + ["bash", "-c", patch_coupl_write], + cwd=pjoin(self.dir_path, "Source", "MODEL"), + 
text=True, + capture_output=True, + check=True, # raise CalledProcessError on non-zero exit + ) + misc.sprint(result.returncode) + except subprocess.CalledProcessError as e: + logger.debug("####### \n stdout is \n %s", e.stdout) + logger.info("####### \n stderr is \n %s", e.stderr) + logger.info("return code is %s\n", e.returncode) + raise Exception("ERROR while patching coupl_write.inc") from e + + # Additional patching (OM) + self.add_madevent_plugin_fct() # Added by OM + # do not call standard finalize since is this is already done... + #return super().finalize(matrix_element, cmdhistory, MG5options, outputflag) + + # AV (default from OM's tutorial) - overload settings and add a debug printout + def modify_grouping(self, matrix_element): + """allow to modify the grouping (if grouping is in place) + return two value: + - True/False if the matrix_element was modified + - the new(or old) matrix element""" + # Irrelevant here since group_mode=False so this function is never called + misc.sprint('Entering PLUGIN_ProcessExporter.modify_grouping') + return False, matrix_element + + # OM adding a new way to "patch" python file such that the launch command of MG5aMC is working + # this consist in a file plugin_interface.py + # which contains a series of functions and one dictionary variable TO_OVERWRITE + # that will be used to have temporary overwrite of all the key variable passed as string by their value. + # all variable that are file related should be called as madgraph.dir.file.variable + def add_madevent_plugin_fct(self): + """this consist in a file plugin_interface.py + which contains a series of functions and one dictionary variable TO_OVERWRITE + that will be used to have temporary overwrite of all the key variable passed as string by their value. 
+ all variable that are file related should be called as madgraph.dir.file.variable + """ + plugin_path = os.path.dirname(os.path.realpath( __file__ )) + files.cp(pjoin(plugin_path, 'launch_plugin.py'), pjoin(self.dir_path, 'bin', 'internal')) + files.ln(pjoin(self.dir_path, 'lib'), pjoin(self.dir_path, 'SubProcesses')) + +#------------------------------------------------------------------------------------ + +class PLUGIN_ProcessExporter_MadEvent(PLUGIN_ProcessExporter): + """ a class to include all tweak related to madevent and not related to standalone. + in practise this class is never called but only the SIMD or GPU related class""" + + s = PLUGINDIR + '/madgraph/iolibs/template_files/' + # add template file/ linking only needed in the madevent mode and not in standalone + from_template = dict(PLUGIN_ProcessExporter.from_template) + from_template['SubProcesses'] = from_template['SubProcesses'] + [s+'gpu/fbridge_common.inc', + s+'gpu/counters.cc', + s+'gpu/ompnumthreads.cc'] + + to_link_in_P = PLUGIN_ProcessExporter.to_link_in_P + ['fbridge_common.inc', 'counters.cc','ompnumthreads.cc'] + +#------------------------------------------------------------------------------------ + +class SIMD_ProcessExporter(PLUGIN_ProcessExporter_MadEvent): + + # Default class for the run_card to use + run_card_class = launch_plugin.CPPRunCard + + def change_output_args(args, cmd): + """ """ + #cmd._export_format = "madevent_forplugin" + cmd._export_format = 'madevent' + cmd._export_plugin = FortranExporterBridge + args.append('--hel_recycling=False') + args.append('--me_exporter=standalone_simd') + if 'vector_size' not in ''.join(args): + args.append('--vector_size=16') + if 'nb_wrap' not in ''.join(args): + args.append('--nb_wrap=1') + return args + +class FortranExporterBridge(export_v4.ProcessExporterFortranMEGroup): + _file_path = export_v4._file_path + + def write_auto_dsig_file(self, writer, matrix_element, proc_id = ""): + replace_dict,context = 
super().write_auto_dsig_file(False, matrix_element, proc_id) + replace_dict['additional_header'] = """ + INTEGER IEXT + + INTEGER ISUM_HEL + LOGICAL MULTI_CHANNEL + COMMON/TO_MATRIX/ISUM_HEL, MULTI_CHANNEL + + LOGICAL FIRST_CHID + SAVE FIRST_CHID + DATA FIRST_CHID/.TRUE./ + +#ifdef MG5AMC_MEEXPORTER_CUDACPP + INCLUDE 'coupl.inc' ! for ALL_G + INCLUDE 'fbridge.inc' + INCLUDE 'fbridge_common.inc' + INCLUDE 'genps.inc' + INCLUDE 'run.inc' + DOUBLE PRECISION OUT2(VECSIZE_MEMMAX) + INTEGER SELECTED_HEL2(VECSIZE_MEMMAX) + INTEGER SELECTED_COL2(VECSIZE_MEMMAX) + DOUBLE PRECISION CBYF1 + INTEGER*4 NGOODHEL, NTOTHEL + + INTEGER*4 NWARNINGS + SAVE NWARNINGS + DATA NWARNINGS/0/ + + LOGICAL FIRST + SAVE FIRST + DATA FIRST/.TRUE./""" + replace_dict['OMP_LIB'] = '' + replace_dict['OMP_PREFIX'] = """IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2) +#endif +CALL COUNTERS_SMATRIX1MULTI_START( -1, VECSIZE_USED ) ! fortranMEs=-1""" + replace_dict["OMP_POSTFIX"] = open(pjoin(PLUGINDIR,'madgraph','iolibs','template_files','gpu','smatrix_multi.f')).read().split('\n',4)[4] # AV skip 4 copyright lines + if writer: + file = open(pjoin(self._file_path, 'iolibs/template_files/auto_dsig_v4.inc')).read() + file = file % replace_dict + # Write the file + writer.writelines(file, context=context) + else: + return replace_dict, context + + def write_driver(self, writer, *args, **kwargs): + """Write the SubProcess/driver.f file with additions from CUDACPP""" + replace_dict = super().write_driver(False, *args, **kwargs) + + # Additions from CUDACPP plugin (after patch) + replace_dict['DRIVER_EXTRA_HEADER'] += """ + character*255 env_name, env_value + integer env_length, env_status + +#ifdef MG5AMC_MEEXPORTER_CUDACPP + INCLUDE 'fbridge.inc' +c INCLUDE 'fbridge_common.inc' +#endif + INCLUDE 'fbridge_common.inc' +""" + + replace_dict['DRIVER_EXTRA_INITIALISE'] += """ +#ifdef _OPENMP + CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD() +#endif + CALL COUNTERS_INITIALISE() + 
+#ifdef MG5AMC_MEEXPORTER_CUDACPP + fbridge_mode = 1 ! CppOnly=1, default for CUDACPP +#else + fbridge_mode = 0 ! FortranOnly=0, default for FORTRAN +#endif + env_name = 'CUDACPP_RUNTIME_FBRIDGEMODE' + call get_environment_variable(env_name, env_value, env_length, env_status) + if( env_status.eq.0 ) then + write(*,*) 'Found environment variable "', trim(env_name), '" with value "', trim(env_value), '"' + read(env_value,'(I255)') FBRIDGE_MODE ! see https://gcc.gnu.org/onlinedocs/gfortran/ICHAR.html + write(*,*) 'FBRIDGE_MODE (from env) = ', FBRIDGE_MODE + else if( env_status.eq.1 ) then ! 1 = not defined + write(*,*) 'FBRIDGE_MODE (default) = ', FBRIDGE_MODE + else ! -1 = too long for env_value, 2 = not supported by O/S + write(*,*) 'ERROR! get_environment_variable failed for "', trim(env_name), '"' + STOP + endif +#ifndef MG5AMC_MEEXPORTER_CUDACPP + if( fbridge_mode.ne.0 ) then + write(*,*) 'ERROR! Invalid fbridge_mode (in FORTRAN backend mode) = ', fbridge_mode + STOP + endif +#endif + + env_name = 'CUDACPP_RUNTIME_VECSIZEUSED' + call get_environment_variable(env_name, env_value, env_length, env_status) + if( env_status.eq.0 ) then + write(*,*) 'Found environment variable "', trim(env_name), '" with value "', trim(env_value), '"' + read(env_value,'(I255)') VECSIZE_USED ! see https://gcc.gnu.org/onlinedocs/gfortran/ICHAR.html + write(*,*) 'VECSIZE_USED (from env) = ', VECSIZE_USED + else if( env_status.eq.1 ) then ! 1 = not defined + write(*,*) 'VECSIZE_USED (default) = ', VECSIZE_USED + else ! -1 = too long for env_value, 2 = not supported by O/S + write(*,*) 'ERROR! get_environment_variable failed for "', trim(env_name), '"' + STOP + endif + if( VECSIZE_USED.gt.VECSIZE_MEMMAX .or. VECSIZE_USED.le.0 ) then + write(*,*) 'ERROR! Invalid VECSIZE_USED = ', VECSIZE_USED + STOP + endif + +#ifdef MG5AMC_MEEXPORTER_CUDACPP + CALL FBRIDGECREATE(FBRIDGE_PBRIDGE, VECSIZE_USED, NEXTERNAL, 4) ! 
this must be at the beginning as it initialises the CUDA device + FBRIDGE_NCBYF1 = 0 + FBRIDGE_CBYF1SUM = 0 + FBRIDGE_CBYF1SUM2 = 0 + FBRIDGE_CBYF1MAX = -1D100 + FBRIDGE_CBYF1MIN = 1D100 +#endif +""" + + replace_dict['DRIVER_EXTRA_FINALISE'] += """ +#ifdef MG5AMC_MEEXPORTER_CUDACPP + CALL FBRIDGEDELETE(FBRIDGE_PBRIDGE) ! this must be at the end as it shuts down the CUDA device + IF( FBRIDGE_MODE .LE. -1 ) THEN ! (BothQuiet=-1 or BothDebug=-2) + WRITE(*,'(a,f10.8,a,e8.2)') + & ' [MERATIOS] ME ratio CudaCpp/Fortran: MIN = ', + & FBRIDGE_CBYF1MIN + 1, ' = 1 - ', -FBRIDGE_CBYF1MIN + WRITE(*,'(a,f10.8,a,e8.2)') + & ' [MERATIOS] ME ratio CudaCpp/Fortran: MAX = ', + & FBRIDGE_CBYF1MAX + 1, ' = 1 + ', FBRIDGE_CBYF1MAX + WRITE(*,'(a,i6)') + & ' [MERATIOS] ME ratio CudaCpp/Fortran: NENTRIES = ', + & FBRIDGE_NCBYF1 +c WRITE(*,'(a,e8.2)') +c & ' [MERATIOS] ME ratio CudaCpp/Fortran - 1: AVG = ', +c & FBRIDGE_CBYF1SUM / FBRIDGE_NCBYF1 +c WRITE(*,'(a,e8.2)') +c & ' [MERATIOS] ME ratio CudaCpp/Fortran - 1: STD = ', +c & SQRT( FBRIDGE_CBYF1SUM2 / FBRIDGE_NCBYF1 ) ! ~standard deviation + WRITE(*,'(a,e8.2,a,e8.2)') + & ' [MERATIOS] ME ratio CudaCpp/Fortran - 1: AVG = ', + & FBRIDGE_CBYF1SUM / FBRIDGE_NCBYF1, ' +- ', + & SQRT( FBRIDGE_CBYF1SUM2 ) / FBRIDGE_NCBYF1 ! 
~standard error + ENDIF +#endif + CALL COUNTERS_FINALISE() +""" + + if writer: + text = open(pjoin(self._file_path,'iolibs','template_files','madevent_driver.f')).read() % replace_dict + writer.write(text) + return True + return replace_dict +#------------------------------------------------------------------------------------ + +class GPU_ProcessExporter(PLUGIN_ProcessExporter_MadEvent): + + # Default class for the run_card to use + run_card_class = launch_plugin.GPURunCard + + def change_output_args(args, cmd): + """ """ + cmd._export_format = 'madevent' + cmd._export_plugin = FortranExporterBridge + + args.append('--hel_recycling=False') + args.append('--me_exporter=standalone_cuda') + if 'vector_size' not in ''.join(args): + args.append('--vector_size=32') + if 'nb_wrap' not in ''.join(args): + args.append('--nb_wrap=512') + return args + + def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): + misc.sprint("enter dedicated function") + out = super().finalize(matrix_element, cmdhistory, MG5options, outputflag) + # OM change RunCard class to have default for GPU + text = open(pjoin(self.dir_path, 'bin', 'internal', 'launch_plugin.py'), 'r').read() + text = text.replace('RunCard = CPPRunCard', 'RunCard = GPURunCard') + open(pjoin(self.dir_path, 'bin', 'internal', 'launch_plugin.py'), 'w').write(text) + return out + +#------------------------------------------------------------------------------------ diff --git a/PLUGIN/CUDACPP_OUTPUT/trex.py b/PLUGIN/CUDACPP_OUTPUT/trex.py new file mode 100644 index 0000000000..53e0817256 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/trex.py @@ -0,0 +1,820 @@ +# Copyright (C) 2023-2025 CERN. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: Z. Wettersten (Sep 2024) for the MG5aMC CUDACPP plugin. +# Further modified by: Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. 
+ +import os +import subprocess +import re +import sys +import importlib.util + +# AV - PLUGIN_NAME can be one of PLUGIN/CUDACPP_OUTPUT or MG5aMC_PLUGIN/CUDACPP_OUTPUT +PLUGIN_NAME = __name__.rsplit('.',1)[0] + +# AV - use templates for source code, scripts and Makefiles from PLUGINDIR instead of MG5DIR +###from madgraph import MG5DIR +PLUGINDIR = os.path.dirname( __file__ ) + +__import__('%s.output'%PLUGIN_NAME) +output = sys.modules['%s.output'%PLUGIN_NAME] +__import__('%s.model_handling'%PLUGIN_NAME) +model_handling = sys.modules['%s.model_handling'%PLUGIN_NAME] + +import importlib.util +SPEC_EXPORTCPP = importlib.util.find_spec('madgraph.iolibs.export_cpp') +PLUGIN_export_cpp = importlib.util.module_from_spec(SPEC_EXPORTCPP) +SPEC_EXPORTCPP.loader.exec_module(PLUGIN_export_cpp) +###sys.modules['PLUGIN.CUDACPP_OUTPUT.PLUGIN_export_cpp'] = PLUGIN_export_cpp # allow 'import PLUGIN.CUDACPP_OUTPUT.PLUGIN_export_cpp' in model_handling.py +sys.modules['%s.PLUGIN_export_cpp'%PLUGIN_NAME] = PLUGIN_export_cpp # allow 'import .PLUGIN_export_cpp' in model_handling.py +del SPEC_EXPORTCPP + +# AV - create a plugin-specific logger +import logging +logger = logging.getLogger('madgraph.%s.model_handling'%PLUGIN_NAME) + +#------------------------------------------------------------------------------------ + + +#------------------------------------------------------------------------------------ + +from os.path import join as pjoin +import madgraph +import madgraph.iolibs.files as files +import madgraph.iolibs.export_v4 as export_v4 +import madgraph.various.misc as misc +import madgraph.interface.reweight_interface as rwgt_interface +import madgraph.various.banner as banner +import models.check_param_card as check_param_card +import madgraph.interface.extended_cmd as extended_cmd +import madgraph.interface.common_run_interface as common_run_interface + +from . 
import launch_plugin + +class TREX_OneProcessExporter(model_handling.PLUGIN_OneProcessExporter): + """A custom OneProcessExporter for the TREX reweighting""" + + rex_path = os.path.join( PLUGINDIR, 'MadtRex' ) + + # ZW - rwgt functions + def get_rwgt_legs(self, process): + """Return string with particle ids and status in the REX std::pair format""" + return ",".join(["{%i,%i}" % (leg.get('state'), leg.get('id')) \ + for leg in process.get('legs')]).replace('0', '-1') + + def get_rwgt_legs_status(self, process): + """Return string with particle statuses as a C++ vector""" + return "{" + ",".join(["%i" % leg.get('state') for leg in process.get('legs')]).replace('0', '-1') + "}" + + def get_rwgt_legs_pdg(self, process): + """Return string with particle PDG IDs as a C++ vector""" + return "{" + ",".join(["%i" % leg.get('id') for leg in process.get('legs')]) + "}" + + def get_rwgt_status_vec(self, processes): + """Return string with vectors of particle statuses""" + prtSets = [] + for k in range(len(processes)): + prtSets.append( self.get_rwgt_legs_status(processes[k])) + return ",".join(prtSets) + + def get_rwgt_pdg_vec(self, processes): + """Return string with vectors of particle PDG IDs""" + prtSets = [] + for k in range(len(processes)): + prtSets.append(self.get_rwgt_legs_pdg(processes[k])) + return ",".join(prtSets) + + def get_rwgt_legs_vec(self, processes): + """Return string with vectors of particle ids and statuses""" + prtSets = [] + for k in range(len(processes)): + prtSets.append("{" + self.get_rwgt_legs(processes[k]) + "}") + return ",".join(prtSets) + + def get_init_prts_vec(self, process): + """Return string with initial state particle ids for use in REX event sorting""" + prts = ",".join(["%i" % leg.get('id') for leg in process.get('legs') if leg.get('state') == 0]) + return "{" + prts + "}" + + def get_init_prts_vecs(self, processes): + """Return string with vectors of initial state particle ids""" + prtSets = [] + for k in range(len(processes)): + 
prtSets.append(self.get_init_prts_vec(processes[k])) + return ",".join(prtSets) + + def get_fin_prts_vec(self, process): + """Return string with final state particle ids for use in REX event sorting""" + prts = ",".join(["%i" % leg.get('id') for leg in process.get('legs') if leg.get('state') == 1]) + return "{" + prts + "}" + + def get_fin_prts_vecs(self, processes): + """Return string with vectors of final state particle ids""" + prtSets = [] + for k in range(len(processes)): + prtSets.append(self.get_fin_prts_vec(processes[k])) + return ",".join(prtSets) + + def get_rwgt_procMap(self, process): + """Return string with particle states and order in the REX procMap format""" + currState = False + retString = "thisProc{{-1,{" + for leg in process.get('legs'): + if currState == leg.get('state'): + retString += "%i," % leg.get('id') + else: + currState = leg.get('state') + retString += "}},{1,{%i," % leg.get('id') + retString = retString[:-1] + "}}}" + return retString + + def get_proc_dir(self): + """Return process directory name for the current process""" + return "P%d_%s" % (self.process_number, self.process_name) + + def get_rwgt_runner(self): + """Return string to initialise the rwgtRunners in tRex""" + return "%s::runner" % (self.get_proc_dir()) + + def get_rwgt_includes(self): + """Return string with the include directives for the tRex reweighting""" + return "#include \"P%d_%s/rwgt_runner.cc\"" % (self.process_number, self.process_name) + + def write_rwgt_header(self): + """Writes a simple rwgt_runner.h file to forward declare the runner object""" + # Adjust the placeholders for use with `.format()` + rwgt_h = """#ifndef {namespace}_RWGT_RUNNER_H + #define {namespace}_RWGT_RUNNER_H + #include \"rwgt_instance.h\" + namespace {namespace} {{ + extern rwgt::instance runner; + }} + #endif""".format(namespace=self.get_proc_dir()) + + # Using `with` statement for better file handling + with open(os.path.join(self.path, 'rwgt_runner.h'), 'w') as ff: + ff.write(rwgt_h) 
+ + def edit_rwgt_header(self): + """Adds process-specific details to the rwgt_runner.h template""" + replace_dict = super().get_process_class_definitions(write=False) + replace_dict['process_namespace'] = self.get_proc_dir() + replace_dict['info_lines'] = model_handling.PLUGIN_export_cpp.get_mg5_info_lines() + template = open(pjoin(self.rex_path,'template_files', 'rwgt_runner_h.inc'),'r').read() + ff = open(pjoin(self.path, 'rwgt_runner.h'),'w') + ff.write(template % replace_dict) + ff.close() + + def edit_rwgt_runner(self): + """Create the rwgt_runner.cc file for the tRex reweighting""" + ###misc.sprint('Entering PLUGIN_OneProcessExporterRwgt.edit_rwgt_runner') + # Create the rwgt_runner.cc file + replace_dict = super().get_process_class_definitions(write=False) + replace_dict['process_namespace'] = self.get_proc_dir() + replace_dict['info_lines'] = model_handling.PLUGIN_export_cpp.get_mg5_info_lines() + replace_dict['parton_ids'] = self.get_rwgt_pdg_vec(self.matrix_elements[0].get('processes')) + replace_dict['parton_status'] = self.get_rwgt_status_vec(self.matrix_elements[0].get('processes')) + replace_dict['no_events'] = len(self.matrix_elements[0].get('processes')) + template = open(pjoin(self.rex_path,'template_files', 'rwgt_runner_cc.inc'),'r').read() + ff = open(pjoin(self.path, 'rwgt_runner.cc'),'w') + ff.write(template % replace_dict) + ff.close() + + # ZW - override the PLUGIN method to generate the rwgt_runner.cc file as well + # note: also generating standard check_sa.cc and gcheck_sa.cu files, which + # are not used in the REX reweighting + def generate_process_files(self): + """Generate mgOnGpuConfig.h, CPPProcess.cc, CPPProcess.h, check_sa.cc, gXXX.cu links""" + super().generate_process_files() + self.edit_rwgt_header() + self.edit_rwgt_runner() + +class TREX_ProcessExporter(output.PLUGIN_ProcessExporter): + + oneprocessclass = TREX_OneProcessExporter + + rwgt_names = [] + proc_lines = [] + + s = PLUGINDIR + '/madgraph/iolibs/template_files/' + t = 
PLUGINDIR + '/MadtRex/' + r = PLUGINDIR + '/MadtRex/template_files/' + m = PLUGINDIR + '/MadtRex/makefiles/' + from_template = dict(output.PLUGIN_ProcessExporter.from_template) + from_template['src'] = from_template['src'] + [t+'librex.so', t+'libtearex.so', + t+'Rex.h', t+'teaRex.h', + r+'rwgt_instance.h', r+'rwgt_instance.cc'] + from_template['SubProcesses'] = from_template['SubProcesses'] + [m+'cudacpp_driver.mk', + r+'rwgt_instance.h', t+'Rex.h', t+'teaRex.h'] + + to_link_in_P = output.PLUGIN_ProcessExporter.to_link_in_P + ['rwgt_instance.h', 'Rex.h', 'teaRex.h'] + + template_src_make = pjoin(m,'cudacpp_rex_src.mk') + # template_tst_make = pjoin(m,'cudacpp_test.mk') + template_Sub_make = pjoin(m,'cudacpp_runner.mk') + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.template_path = PLUGINDIR + self.rwgt_names = [] + self.proc_lines = [] + self.compile_library() + + def check_library(self, temp_dir=None): + """Checks whether librex.so and libtearex.so exist in the rex template directory""" + if temp_dir is None: + temp_dir = pjoin(PLUGINDIR, 'MadtRex') + if not os.path.exists(pjoin(temp_dir, 'librex.so')): + return False + if not os.path.exists(pjoin(temp_dir, 'libtearex.so')): + return False + return True + + def compile_library(self): + """Compile librex.so and libtearex.so in the rex template directory""" + rex_template_dir = pjoin(PLUGINDIR, 'MadtRex') + if not self.check_library(rex_template_dir): + logger.info('Rex or teaRex libraries not found, compiling them now...') + logger.info('This may take a while, but only needs to be done once.') + misc.compile(arg=['-f', 'rex.mk'],cwd=rex_template_dir,mode='cpp') + return + + def generate_subprocess_directory(self, matrix_element, cpp_helas_call_writer, + proc_number=None): + """Generate the Pxxxxx directory for a subprocess in C++ standalone, + including the necessary .h and .cc files""" + + + process_exporter_cpp = self.oneprocessclass(matrix_element,cpp_helas_call_writer) + + 
self.rwgt_names.append("P%d_%s" % (process_exporter_cpp.process_number, + process_exporter_cpp.process_name)) + + process_lines = "\n".join([process_exporter_cpp.get_process_info_lines(me) for me in \ + process_exporter_cpp.matrix_elements]) + self.proc_lines.append(process_lines) + + # Create the directory PN_xx_xxxxx in the specified path + dirpath = pjoin(self.dir_path, 'SubProcesses', "P%d_%s" % (process_exporter_cpp.process_number, + process_exporter_cpp.process_name)) + try: + os.mkdir(dirpath) + except os.error as error: + logger.warning(error.strerror + " " + dirpath) + + with misc.chdir(dirpath): + logger.info('Creating files in directory %s' % dirpath) + process_exporter_cpp.path = dirpath + # Create the process .h and .cc files + process_exporter_cpp.generate_process_files() + for file in self.to_link_in_P: + files.ln('../%s' % file) + return + + def export_driver(self): + replace_dict = {} + replace_dict['info_lines'] = model_handling.PLUGIN_export_cpp.get_mg5_info_lines() + replace_dict['multiprocess_lines'] = "\n".join(self.proc_lines) + replace_dict['include_lines'] = '' + replace_dict['make_rwgt'] = '' + for name in self.rwgt_names: + replace_dict['include_lines'] += '#include "%s/rwgt_runner.h"\n' % name + replace_dict['make_rwgt'] += '%s::make_reweightor(batch_size),' % name + replace_dict['make_rwgt'] = replace_dict['make_rwgt'][:-1] + template_path = os.path.join( PLUGINDIR, 'madgraph', 'iolibs', 'template_files' ) + template = open(pjoin(self.r, 'rwgt_driver.inc'),'r').read() + ff = open(pjoin(self.dir_path, 'SubProcesses', 'rwgt_driver.cc'),'w') + ff.write(template % replace_dict) + ff.close() + + def link_makefile(self): + """Link the makefile for the tRex reweighting""" + files.ln(pjoin(self.dir_path, 'SubProcesses', 'cudacpp_driver.mk'), starting_dir=pjoin(self.dir_path, 'SubProcesses'), name='makefile') + + def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): + self.export_driver() + self.link_makefile() + return 
super().finalize(matrix_element, cmdhistory, MG5options, outputflag) + +class TREX_ReweightInterface(rwgt_interface.ReweightInterface): + """A custom ReweightInterface for the tRex reweighting""" + + prompt = 'MadtRex>' + sa_class = 'standalone_trex' + debug_output = 'tRex_debug' + + def __init__(self, *args, **kwargs): + """Initialise the tRex reweighting interface + Currently no (substantial) changes compared to upstream are necessary, + but adding an __init__ method allows for future modifications""" + super().__init__(*args, **kwargs) + self.param_card = None + self.reweight_card = [] + self.reweight_names = [] + self.backend = 'cppauto' # default backend for tRex reweighting + self.fptype = 'm' # default floating point type for tRex reweighting + self.nb_threads = 1 + if self.mother: + self.nb_threads = self.mother.options['nb_core'] if self.mother.options['run_mode'] !=0 else 1 + else: + self.nb_threads = 1 + self.batch_size = 32 + self.compile_library() + + def check_library(self, temp_dir=None): + """Checks whether librex.so and libtearex.so exist in the rex template directory""" + if temp_dir is None: + temp_dir = pjoin(PLUGINDIR, 'MadtRex') + if not os.path.exists(pjoin(temp_dir, 'librex.so')): + return False + if not os.path.exists(pjoin(temp_dir, 'libtearex.so')): + return False + return True + + def compile_library(self): + """Compile librex.so and libtearex.so in the rex template directory""" + if self.multicore=='wait': + return + rex_template_dir = pjoin(PLUGINDIR, 'MadtRex') + if not self.check_library(rex_template_dir): + misc.compile(arg=['-f', 'rex.mk'],cwd=rex_template_dir,mode='cpp') + return + + def setup_f2py_interface(self): + """"Override native setup_f2py_interface to avoid parsing things not necessary for tRex reweighting""" + self.create_standalone_directory() + self.compile() + + def launch_actual_reweighting(self, *args, **kwargs): + """override standard launch command to instead call the tRex reweighting""" + + import csv + + if 
self.rwgt_dir: + path_me =self.rwgt_dir + else: + path_me = self.me_dir + + if self.second_model or self.second_process or self.dedicated_path: + rw_dir = pjoin(path_me, 'rw_me_%s' % self.nb_library) + else: + rw_dir = pjoin(path_me, 'rw_me') + + run_path = pjoin(rw_dir, 'SubProcesses') + input_file = os.path.relpath(self.lhe_input.path, run_path) + output_file = input_file + 'rw' + output_path = self.lhe_input.path + 'rw' + param_card = pjoin(rw_dir, 'Cards', 'param_card.dat') + + #ZW: Exceptions, making sure all the necessary files for teaREX are accessible + if( misc.is_executable(pjoin(run_path,'rwgt_driver_gpu.exe')) ): + driver = pjoin(run_path, 'rwgt_driver_gpu.exe') + elif(misc.is_executable(pjoin(run_path,'rwgt_driver_cpp.exe')) ): + driver = pjoin(run_path,'rwgt_driver_cpp.exe') + else: + raise Exception('No MadtRex driver found for parallel reweighting') + if not os.path.exists(param_card): + try: + files.cp(os.path.join(path_me, 'Cards', 'param_card_default.dat'), param_card) + except: + raise Exception("No param_card.dat file found in %s" % pjoin(path_me, 'Cards')) + param_path = os.path.relpath(param_card, run_path) + + rwgt_card = os.path.join(path_me, 'Cards', 'reweight_card.dat') + + self.write_reweight_card(rwgt_card) + + if not os.path.exists(rwgt_card): + try: + files.cp(os.path.join(path_me, 'Cards', 'reweight_card_default.dat'), rwgt_card) + except: + raise Exception("No reweight_card.dat file found in %s" % pjoin(path_me, 'Cards')) + rwgt_path = os.path.relpath(rwgt_card, run_path) + target = '' + if not self.mother: + name, ext = self.lhe_input.name.rsplit('.',1) + target = '%s_out.%s' % (name, ext) + elif self.output_type != "default" : + target = pjoin(self.mother.me_dir, 'Events', self.mother.run_name, 'events.lhe') + else: + target = self.lhe_input.path + n_threads = self.nb_threads + if n_threads < 1: + n_threads = 1 + nb_warps = self.batch_size + if nb_warps < 1: + nb_warps = 32 + + #ZW: rwgt_driver is written and compiled properly, 
now just to figure out how to run it through MG + subprocess.call([driver, '-lhe=%s' % input_file, '-slha=%s' % param_card, '-rwgt=%s' % rwgt_card, '-out=%s' % output_file, '-nt=%s' % n_threads, '-warp=%s' % nb_warps], cwd=run_path) + + # ZW: check if output exists, if not nicely raise an exception + if not os.path.exists(output_path): + if os.path.exists(target): + files.mv(self.lhe_input.path, target) + logger.info('Error in reweighting: output file not found. Returning original LHE file.') + return + else: + raise Exception('Error in reweighting: output file not found. Input file not found. Exiting.') + else: + files.mv(output_path, target) + csv_file = pjoin(run_path, 'rwgt_results.csv') + with open(csv_file, newline='') as results: + iters = csv.reader(results) + for row in iters: + self.all_cross_section[(row[0],'')] = (float(row[1]), float(row[2])) + + return + + def compile(self): + """override compile to use the TREX makefiles""" + + if self.multicore=='wait': + return + + if not self.rwgt_dir: + path_me = self.me_dir + else: + path_me = self.rwgt_dir + + rwgt_dir_possibility = ['rw_me','rw_me_%s' % self.nb_library,'rw_mevirt','rw_mevirt_%s' % self.nb_library] + for onedir in rwgt_dir_possibility: + if not os.path.isdir(pjoin(path_me,onedir)): + continue + pdir = pjoin(path_me, onedir, 'SubProcesses') + if self.mother: + nb_core = self.mother.options['nb_core'] if self.mother.options['run_mode'] !=0 else 1 + else: + nb_core = 1 + misc.compile(arg=['BACKEND=%s' % self.backend,'FPTYPE=%s' % self.fptype],cwd=pdir, nb_core=nb_core,mode='cpp') + return + + def load_module(self): + """override load_module since we do not use it""" + return + + # def import_command_file(self, filepath): + # """override import_command_file to simply launch TREX""" + # self.exec_cmd('launch', precmd=True) + # return + + def do_change(self,line): + """extend do_change to include the backend and floating point type options""" + args = self.split_arg(line) + if len(args) < 2: + return 
super().do_change(line) + if args[0].lower() in ['cudacpp_backend', 'backend']: + cudacpp_supported_backends = [ 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] + if args[1].lower() in cudacpp_supported_backends: + self.backend = args[1].lower() + logger.info("Setting tRex reweighting backend to '%s'" % self.backend) + else: + raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%(args[1], cudacpp_supported_backends) ) + elif args[0].lower() in ['fptype', 'floating_type']: + fptype_supported = ['f', 'm', 'd'] + fptype_semisupported = ['float', 'mixed', 'double'] + if args[1].lower() in fptype_semisupported: + args[1] = args[1].lower()[0] + if args[1].lower() in fptype_supported: + self.fptype = args[1].lower() + logger.info("Setting tRex reweighting floating point type to '%s'" % self.fptype) + else: + raise Exception( "Invalid fptype='%s': supported types are %s"%(args[1], fptype_supported) ) + elif args[0].lower() in ['nb_core', 'nb_threads']: + try: + self.nb_threads = int(args[1]) + if self.nb_threads < 1: + raise ValueError + logger.info("Setting tRex reweighting number of threads to '%s'" % self.nb_threads) + except ValueError: + raise Exception( "Invalid nb_core='%s': must be a positive integer"%(args[1]) ) + elif args[0].lower() in ['batch_size', 'nb_batch', 'vector_size', 'vec_size', 'nb_warp', 'warp_size']: + try: + self.batch_size = int(args[1]) + if self.batch_size < 1: + raise ValueError + logger.info("Setting tRex reweighting batch size to '%s'" % self.batch_size) + except ValueError: + raise Exception( "Invalid batch_size='%s': must be a positive integer"%(args[1]) ) + else: + return super().do_change(line) + return + + def do_launch(self, line): + """override do_launch to instead overwrite the reweight_card + to fit the expected input for TREX without having to extend TREX itself""" + args = self.split_arg(line) + opts = self.check_launch(args) + mgcmd = self.mg5cmd + if 
opts['rwgt_name']: + self.options['rwgt_name'] = opts['rwgt_name'] + if opts['rwgt_info']: + self.options['rwgt_info'] = opts['rwgt_info'] + model_line = self.banner.get('proc_card', 'full_model_line') + + # TV: Load model: needed for the combine_ij function: maybe not needed everyt time??? + model = self.banner.get('proc_card', 'model') + self.load_model( model, True, False) + + if not self.has_standalone_dir: + out = self.setup_f2py_interface() + if out: + return + + if not self.param_card: + s_orig = self.banner['slha'] + self.param_card = check_param_card.ParamCard(s_orig.splitlines()) + + # get the mode of reweighting #LO/NLO/NLO_tree/... + type_rwgt = self.get_weight_names() + + if self.rwgt_dir: + path_me =self.rwgt_dir + else: + path_me = self.me_dir + + + # get iterator over param_card and the name associated to the current reweighting. + param_card_iterator, tag_name = self.handle_param_card(model_line, args, type_rwgt) + + self.reweight_names.append(tag_name) + + # perform the scanning + if param_card_iterator: + if self.options['rwgt_name']: + reweight_name = self.options['rwgt_name'].rsplit('_',1)[0] # to avoid side effect during the scan + else: + reweight_name = None + for i,card in enumerate(param_card_iterator): + if reweight_name: + self.options['rwgt_name'] = '%s_%s' % (reweight_name, i+1) + self.new_param_card = card + #card.write(pjoin(rw_dir, 'Cards', 'param_card.dat')) + self.exec_cmd("launch --keep_card", printcmd=False, precmd=True) + + def check_multicore(self): + """override check_multicore to overloading the CPU (we never want to run TREX in multicore mode)""" + return False + + def handle_param_card(self, model_line, args, type_rwgt): + """override handle_param_card to get rid of all the unnecessary checks and file writing + now simply loads the param_card and uses get_diff to tranlate into internal format""" + + if self.rwgt_dir: + path_me =self.rwgt_dir + else: + path_me = self.me_dir + + if self.second_model or self.second_process or 
self.dedicated_path: + rw_dir = pjoin(path_me, 'rw_me_%s' % self.nb_library) + else: + rw_dir = pjoin(path_me, 'rw_me') + if not '--keep_card' in args: + if self.has_nlo and self.rwgt_mode != "LO": + rwdir_virt = rw_dir.replace('rw_me', 'rw_mevirt') + with open(pjoin(rw_dir, 'Cards', 'param_card.dat'), 'w') as fsock: + fsock.write(self.banner['slha']) + out, cmd = common_run_interface.CommonRunCmd.ask_edit_card_static(cards=['param_card.dat'], + ask=self.ask, pwd=rw_dir, first_cmd=self.stored_line, + write_file=False, return_instance=True + ) + self.stored_line = None + card = cmd.param_card + new_card = card.write() + elif self.new_param_card: + new_card = self.new_param_card.write() + else: + new_card = open(pjoin(rw_dir, 'Cards', 'param_card.dat')).read() + + # check for potential scan in the new card + pattern_scan = re.compile(r'''^(decay)?[\s\d]*scan''', re.I+re.M) + param_card_iterator = [] + if pattern_scan.search(new_card): + import madgraph.interface.extended_cmd as extended_cmd + try: + import internal.extended_cmd as extended_internal + Shell_internal = extended_internal.CmdShell + except: + Shell_internal = None + if not isinstance(self.mother, (extended_cmd.CmdShell, Shell_internal)): + raise Exception("scan are not allowed on the Web") + # at least one scan parameter found. 
create an iterator to go trough the cards + main_card = check_param_card.ParamCardIterator(new_card) + if self.options['rwgt_name']: + self.options['rwgt_name'] = '%s_0' % self.options['rwgt_name'] + + param_card_iterator = main_card + first_card = param_card_iterator.next(autostart=True) + new_card = first_card.write() + self.new_param_card = first_card + #first_card.write(pjoin(rw_dir, 'Cards', 'param_card.dat')) + + # check if "Auto" is present for a width parameter) + if 'block' not in new_card.lower(): + raise Exception(str(new_card)) + tmp_card = new_card.lower().split('block',1)[1] + if "auto" in tmp_card: + if param_card_iterator: + first_card.write(pjoin(rw_dir, 'Cards', 'param_card.dat')) + else: + ff = open(pjoin(rw_dir, 'Cards', 'param_card.dat'),'w') + ff.write(new_card) + ff.close() + + self.mother.check_param_card(pjoin(rw_dir, 'Cards', 'param_card.dat')) + new_card = open(pjoin(rw_dir, 'Cards', 'param_card.dat')).read() + + + # Find new tag in the banner and add information if needed + if 'initrwgt' in self.banner and self.output_type == 'default': + if 'name=\'mg_reweighting\'' in self.banner['initrwgt']: + blockpat = re.compile(r'''(?P.*?)''', re.I+re.M+re.S) + before, content, after = blockpat.split(self.banner['initrwgt']) + header_rwgt_other = before + after + pattern = re.compile('\\d+)|(?P[_\\w\\-\\.]+))(?P\\s*|_\\w+)\'>(?P.*?)', re.S+re.I+re.M) + mg_rwgt_info = pattern.findall(content) + maxid = 0 + for k,(i, fulltag, nlotype, diff) in enumerate(mg_rwgt_info): + if i: + if int(i) > maxid: + maxid = int(i) + mg_rwgt_info[k] = (i, nlotype, diff) # remove the pointless fulltag tag + else: + mg_rwgt_info[k] = (fulltag, nlotype, diff) # remove the pointless id tag + + maxid += 1 + rewgtid = maxid + if self.options['rwgt_name']: + #ensure that the entry is not already define if so overwrites it + for (i, nlotype, diff) in mg_rwgt_info[:]: + for flag in type_rwgt: + if 'rwgt_%s' % i == '%s%s' %(self.options['rwgt_name'],flag) or \ + i == '%s%s' % 
(self.options['rwgt_name'], flag): + logger.warning("tag %s%s already defines, will replace it", self.options['rwgt_name'],flag) + mg_rwgt_info.remove((i, nlotype, diff)) + + else: + header_rwgt_other = self.banner['initrwgt'] + mg_rwgt_info = [] + rewgtid = 1 + else: + self.banner['initrwgt'] = '' + header_rwgt_other = '' + mg_rwgt_info = [] + rewgtid = 1 + + # add the reweighting in the banner information: + #starts by computing the difference in the cards. + #s_orig = self.banner['slha'] + #self.orig_param_card_text = s_orig + s_new = new_card + self.new_param_card = check_param_card.ParamCard(s_new.splitlines()) + + #define tag for the run + if self.options['rwgt_name']: + tag = self.options['rwgt_name'] + else: + tag = str(rewgtid) + + if 'rwgt_info' in self.options and self.options['rwgt_info']: + card_diff = self.options['rwgt_info'] + for name in type_rwgt: + mg_rwgt_info.append((tag, name, self.options['rwgt_info'])) + elif not self.second_model and not self.dedicated_path: + old_param = self.param_card + new_param = self.new_param_card + card_diff = old_param.create_diff(new_param) + if card_diff == '' and not self.second_process: + logger.warning(' REWEIGHTING: original card and new card are identical.') + try: + if old_param['sminputs'].get(3)- new_param['sminputs'].get(3) > 1e-3 * new_param['sminputs'].get(3): + logger.warning("We found different value of alpha_s. 
Note that the value of alpha_s used is the one associate with the event and not the one from the cards.") + except Exception as error: + logger.debug("error in check of alphas: %s" % str(error)) + pass #this is a security + if not self.second_process: + for name in type_rwgt: + mg_rwgt_info.append((tag, name, card_diff)) + else: + str_proc = "\n change process ".join([""]+self.second_process) + for name in type_rwgt: + mg_rwgt_info.append((tag, name, str_proc + '\n'+ card_diff)) + else: + if self.second_model: + str_info = "change model %s" % self.second_model + else: + str_info ='' + if self.second_process: + str_info += "\n change process ".join([""]+self.second_process) + if self.dedicated_path: + for k,v in self.dedicated_path.items(): + str_info += "\n change %s %s" % (k,v) + card_diff = str_info + str_info += '\n' + s_new + for name in type_rwgt: + mg_rwgt_info.append((tag, name, str_info)) + + # re-create the banner. + self.banner['initrwgt'] = header_rwgt_other + if self.output_type == 'default': + self.banner['initrwgt'] += '\n\n' + else: + self.banner['initrwgt'] += '\n\n' + for tag, rwgttype, diff in mg_rwgt_info: + if self.inc_sudakov: + try: + sud_order = int(rwgttype[-1]) -1 + sud_order = '10' +rwgttype[-2:] + self.banner['initrwgt'] += '%sscale_%s_sud\n' % \ + (rwgttype, diff, sud_order) + except IndexError: + logger.critical('This is a reweighted event file! 
Do not reweight with ewsudakov twice') + sys.exit(1) + else: + if tag.isdigit(): + self.banner['initrwgt'] += '%s\n' % \ + (tag, rwgttype, diff) + else: + self.banner['initrwgt'] += '%s\n' % \ + (tag, rwgttype, diff) + self.banner['initrwgt'] += '\n\n' + self.banner['initrwgt'] = self.banner['initrwgt'].replace('\n\n', '\n') + + #logger.info('starts to compute weight for events with the following modification to the param_card:') + #logger.info(card_diff.replace('\n','\nKEEP:')) + try: + self.run_card = banner.Banner(self.banner).charge_card('run_card') + except Exception: + logger.debug('no run card found -- reweight interface') + self.run_card = None + + if self.options['rwgt_name']: + tag_name = self.options['rwgt_name'] + else: + tag_name = 'rwgt_%s' % rewgtid + + self.reweight_card.append(card_diff) + + return param_card_iterator, tag_name + + def write_reweight_card(self,rwgt_path): + """function for collecting all the reweight iterations from the parsed reweight card + and write it out with the explicit 'set BLOCK PARAM VALUE' format""" + if( len(self.reweight_names) != len(self.reweight_card) ): + raise Exception('Mismatch in number of reweight names and reweight cards') + + output_card = '' + + for i, card in enumerate(self.reweight_card): + output_card += 'launch --rwgt_name=%s\n' % self.reweight_names[i] + output_card += card + '\n' + + output_card = output_card.replace('param_card', '').replace(' ', ' ') + + with open(rwgt_path, 'w') as f: + f.write(output_card) + + return + + def do_quit(self, line): + if self.exitted: + return + try: + self.launch_actual_reweighting() + except: + raise Exception("Error in tRex reweighting. 
Exiting.") + + self.exitted = True + + if 'init' in self.banner: + cross = 0 + error = 0 + for line in self.banner['init'].split('\n'): + split = line.split() + if len(split) == 4: + cross += float(split[0]) + error += float(split[1])**2 + error = error**0.5 + if not self.multicore == 'create': + # No print of results for the multicore mode for the one printed on screen + if 'orig' not in self.all_cross_section: + logger.info('Original cross-section: %s +- %s pb' % (cross, error)) + else: + logger.info('Original cross-section: %s +- %s pb (cross-section from sum of weights: %s)' % (cross, error, self.all_cross_section['orig'][0])) + logger.info('Computed cross-section:') + keys = list(self.all_cross_section.keys()) + keys.sort(key=lambda x: str(x)) + for key in keys: + if key == 'orig': + continue + logger.info('%s : %s +- %s pb' % (key[0] if not key[1] else '%s%s' % key, + self.all_cross_section[key][0],self.all_cross_section[key][1] )) + self.terminate_fortran_executables() + + if self.rwgt_dir and self.multicore == False: + self.save_to_pickle() + + with misc.stdchannel_redirected(sys.stdout, os.devnull): + for run_id in self.calculator: + del self.calculator[run_id] + del self.calculator \ No newline at end of file From b81c14dd7bfb6c6640e43f8b1178d57ba470a5cf Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 09:05:36 +0100 Subject: [PATCH 02/33] Integrate CUDACPP with MadSpace for mg7 output mode --- PLUGIN/CUDACPP_OUTPUT/__init__.py | 3 + .../iolibs/template_files/mg7/api.cpp | 222 ++++ .../madgraph/iolibs/template_files/mg7/api.h | 209 ++++ .../iolibs/template_files/mg7/generate_events | 10 + .../iolibs/template_files/mg7/madevent.py | 1037 +++++++++++++++++ .../iolibs/template_files/mg7/run_card.toml | 124 ++ .../iolibs/template_files/mg7/train_madnis.py | 124 ++ PLUGIN/CUDACPP_OUTPUT/model_handling.py | 26 +- PLUGIN/CUDACPP_OUTPUT/output.py | 70 +- madgraph/iolibs/export_cpp.py | 67 +- madgraph/iolibs/export_mg7.py | 4 +- 11 files 
changed, 1824 insertions(+), 72 deletions(-) create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/api.cpp create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/api.h create mode 100755 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/generate_events create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml create mode 100644 PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/train_madnis.py diff --git a/PLUGIN/CUDACPP_OUTPUT/__init__.py b/PLUGIN/CUDACPP_OUTPUT/__init__.py index 381a7805fb..8428fa551c 100644 --- a/PLUGIN/CUDACPP_OUTPUT/__init__.py +++ b/PLUGIN/CUDACPP_OUTPUT/__init__.py @@ -42,6 +42,9 @@ 'madevent_gpu' : output.GPU_ProcessExporter, 'standalone_cudacpp' : output.PLUGIN_ProcessExporter, 'standalone_trex' : trex.TREX_ProcessExporter, + 'mg7_simd' : output.MG7_SIMD_ProcessExporter, + 'mg7_cuda' : output.MG7_CUDA_ProcessExporter, + 'mg7_hip' : output.MG7_HIP_ProcessExporter, # the following one are used for the second exporter class # (not really needed so far but interesting if need # specialization in the futur) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/api.cpp b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/api.cpp new file mode 100644 index 0000000000..733229231b --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/api.cpp @@ -0,0 +1,222 @@ +#include "CPPProcess.h" +#include "api.h" +#include + +extern "C" { + +UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result) { + switch (meta_key) { + case UMAMI_META_DEVICE: { + *static_cast(result) = UMAMI_DEVICE_CPU; + break; + } case UMAMI_META_PARTICLE_COUNT: + *static_cast(result) = CPPProcess::nexternal; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast(result) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast(result) = 
CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; +} + +UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path) { + CPPProcess* process = new CPPProcess(param_card_path); + std::vector& momenta = process->getMomenta(); + for (int i = 0; i < CPPProcess::nexternal; ++i) momenta.push_back(new double[4]()); + *handle = process; + return UMAMI_SUCCESS; +} + + +UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag +) { + return UMAMI_ERROR_NOT_IMPLEMENTED; +} + + +UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag +) { + return UMAMI_ERROR_NOT_IMPLEMENTED; +} + +UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs +) { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + + for (std::size_t i = 0; i < input_count; ++i) { + const void* input = inputs[i]; + switch (input_keys[i]) { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast(input); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast(input); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast(input); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast(input); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast(input); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast(input); + break; + default: + return 
UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if (!momenta_in) return UMAMI_ERROR_MISSING_INPUT; + + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for (std::size_t i = 0; i < output_count; ++i) { + void* output = outputs[i]; + switch (output_keys[i]) { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast(output); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast(output); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast(output); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast(output); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast(output); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + + CPPProcess* process = static_cast(handle); + + std::vector& process_momenta = process->getMomenta(); + for (size_t i_batch = 0; i_batch < count; ++i_batch) { + for (size_t i_part = 0; i_part < CPPProcess::nexternal; ++i_part) { + for(size_t i_mom = 0; i_mom < 4; ++i_mom) { + process_momenta[i_part][i_mom] = + momenta_in[stride * (CPPProcess::nexternal * i_mom + i_part) + i_batch]; + } + } + process->getParameters().aS = alpha_s_in ? 
alpha_s_in[i_batch] : 0.118; + double m2 = process->sigmaKin(flavor_in[i_batch]); + if (m2_out) m2_out[i_batch] = m2; + if (color_out) color_out[i_batch] = 0; + if (diagram_out) diagram_out[i_batch] = 0; + if (helicity_out) helicity_out[i_batch] = 0; + if (amp2_out) { + double chan_total = 0.; + const double* amp2 = process->getAmp2(); + for(size_t i_amp = 0; i_amp < CPPProcess::ndiagrams; ++i_amp) { + double amp2_item = amp2[i_amp]; + amp2_out[i_amp * stride + i_batch] = amp2_item; + chan_total += amp2_item; + } + for(size_t i_amp = 0; i_amp < CPPProcess::ndiagrams; ++i_amp) { + amp2_out[i_amp * stride + i_batch] /= chan_total; + } + } + } + + return UMAMI_SUCCESS; +} + +void compute_matrix_element_multichannel( + void* subprocess, + size_t count, + size_t stride, + const double* momenta_in, + const double* alpha_s_in, + const double* random_in, + const int* flavor_in, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + void* cuda_stream +) { + CPPProcess* process = static_cast(subprocess); + + std::vector& process_momenta = process->getMomenta(); + for (size_t i_batch = 0; i_batch < count; ++i_batch) { + for (size_t i_part = 0; i_part < CPPProcess::nexternal; ++i_part) { + for(size_t i_mom = 0; i_mom < 4; ++i_mom) { + process_momenta[i_part][i_mom] = + momenta_in[stride * (CPPProcess::nexternal * i_mom + i_part) + i_batch]; + } + } + process->getParameters().aS = alpha_s_in[i_batch]; + m2_out[i_batch] = process->sigmaKin(flavor_in[i_batch]); + color_out[i_batch] = 0; + diagram_out[i_batch] = 0; + helicity_out[i_batch] = 0; + double chan_total = 0.; + const double* amp2 = process->getAmp2(); + for(size_t i_amp = 0; i_amp < CPPProcess::ndiagrams; ++i_amp) { + double amp2_item = amp2[i_amp]; + amp2_out[i_amp * stride + i_batch] = amp2_item; + chan_total += amp2_item; + } + for(size_t i_amp = 0; i_amp < CPPProcess::ndiagrams; ++i_amp) { + amp2_out[i_amp * stride + i_batch] /= chan_total; + } + } +} + +void 
free_subprocess(void* subprocess) { + CPPProcess* process = static_cast(subprocess); + std::vector& momenta = process->getMomenta(); + for (int i = 0; i < CPPProcess::nexternal; ++i) delete[] momenta[i]; + delete process; +} + +UmamiStatus umami_free(UmamiHandle handle) { + CPPProcess* process = static_cast(handle); + std::vector& momenta = process->getMomenta(); + for (int i = 0; i < CPPProcess::nexternal; ++i) delete[] momenta[i]; + delete process; + return UMAMI_SUCCESS; +} + +} diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/api.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/api.h new file mode 100644 index 0000000000..2152036924 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/api.h @@ -0,0 +1,209 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ +const int UMAMI_MAJOR_VERSION = 1; +/** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ +const int UMAMI_MINOR_VERSION = 0; + +typedef enum { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, +} UmamiStatus; + +typedef enum { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, +} UmamiDevice; + +typedef enum { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, +} UmamiMetaKey; + +typedef enum { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, +} UmamiInputKey; + +typedef enum { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME +} UmamiOutputKey; + +typedef void* UmamiHandle; + + +/** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param meta_key + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ +UmamiStatus umami_get_meta(UmamiMetaKey meta_key, void* result); + +/** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `free_subprocess`. 
+ * @return + * UMAMI_SUCCESS on success, error code otherwise + */ +UmamiStatus umami_initialize(UmamiHandle* handle, char const* param_card_path); + +/** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ +UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag +); + +/** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ +UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag +); + +/** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. 
The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ +UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs +); + +/** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ +UmamiStatus umami_free(UmamiHandle handle); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/generate_events b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/generate_events new file mode 100755 index 0000000000..57316e48a7 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/generate_events @@ -0,0 +1,10 @@ +#! 
/usr/bin/env python3 +import sys, os +from internal.madevent import main + +if __name__ == '__main__': + os.chdir(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + try: + main() + except KeyboardInterrupt: + pass diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py new file mode 100644 index 0000000000..a02c53af81 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py @@ -0,0 +1,1037 @@ +import argparse +import os +import time +from datetime import timedelta +import glob +import json +import subprocess +import logging +from dataclasses import dataclass +from typing import Literal, NamedTuple +try: + import tomllib +except ModuleNotFoundError: + # for versions before 3.11 + import pip._vendor.tomli as tomllib + +if "LHAPDF_DATA_PATH" in os.environ: + PDF_PATH = os.environ["LHAPDF_DATA_PATH"] +else: + try: + import lhapdf + lhapdf.setVerbosity(0) + PDF_PATH = lhapdf.paths()[0] + except ImportError: + raise RuntimeError("Can't load lhapdf module. 
Please set LHAPDF_DATA_PATH manually") + +import madspace as ms +from models.check_param_card import ParamCard + +logger = logging.getLogger("madevent7") + + +def get_start_time(): + return time.time(), time.process_time() + + +def print_run_time(start): + start_time, start_cpu_time = start + train_time = time.time() - start_time + train_cpu_time = time.process_time() - start_cpu_time + print( + f"--- Run time: {str(timedelta(seconds=round(train_time, 2) + 1e-5))[:-4]} wall time, " + f"{str(timedelta(seconds=round(train_cpu_time, 2) + 1e-5))[:-4]} cpu time ---\n" + ) + + +@dataclass +class Channel: + phasespace_mapping: ms.PhaseSpaceMapping + adaptive_mapping: ms.Flow | ms.VegasMapping + discrete_before: ms.DiscreteSampler | ms.DiscreteFlow | None + discrete_after: ms.DiscreteSampler | ms.DiscreteFlow | None + channel_weight_indices: list[int] | None + name: str + + +@dataclass +class PhaseSpace: + mode: Literal["multichannel", "flat", "both"] + channels: list[Channel] + symfact: list[int | None] + chan_weight_remap: list[int] + prop_chan_weights: ms.PropagatorChannelWeights | None = None + subchan_weights: ms.SubchannelWeights | None = None + cwnet: ms.ChannelWeightNetwork | None = None + + +class MultiChannelData(NamedTuple): + amp2_remap: list[int] + symfact: list[int | None] + topologies: list[list[ms.Topology]] + permutations: list[list[list[int]]] + channel_indices: list[list[int]] + channel_weight_indices: list[list[list[int]]] + diagram_indices: list[list[int]] + diagram_color_indices: list[list[list[int]]] + + +@dataclass +class CutItem: + observable_kwargs: dict + min: float + max: float + mode: str + + +@dataclass +class HistItem: + observable_kwargs: dict + min: float + max: float + bin_count: int + + +class MadgraphProcess: + def __init__(self): + self.load_cards() + self.init_backend() + self.init_event_dir() + self.init_context() + self.init_cuts() + self.init_histograms() + self.init_generator_config() + self.init_beam() + self.init_subprocesses() + 
+ def load_cards(self) -> None: + with open(os.path.join("Cards", "run_card.toml"), "rb") as f: + self.run_card = tomllib.load(f) + self.param_card_path = os.path.join("Cards", "param_card.dat") + self.param_card = ParamCard(self.param_card_path) + with open(os.path.join("SubProcesses", "subprocesses.json")) as f: + self.subprocess_data = json.load(f) + + def init_backend(self) -> None: + ms.set_simd_vector_size(self.run_card["run"]["simd_vector_size"]) + ms.set_thread_count(self.run_card["run"]["thread_pool_size"]) + + def init_event_dir(self) -> None: + run_name = self.run_card["run"]["run_name"] + os.makedirs("Events", exist_ok=True) + existing_run_dirs = glob.glob(f"Events/{run_name}_*") + run_index = 1 + while f"Events/{run_name}_{run_index:02d}" in existing_run_dirs: + run_index += 1 + while True: + try: + self.run_path = f"Events/{run_name}_{run_index:02d}" + os.mkdir(self.run_path) + break + except FileExistsError: + run_index += 1 + + def parse_observable(self, name: str, order_observable: str) -> dict: + parts = name.split("-") + sum_momenta = False + sum_observable = False + ordered = False + multiparticles = self.run_card["multiparticles"] + + if len(parts) == 0: + raise ValueError("Invalid observable name") + elif len(parts) == 1: + # event-level observables + obs_name = parts[0] + select_pids = [] + else: + if parts[-1] == "sum": + sum_observable = True + obs_name = parts[-2] + selection = parts[:-2] + elif parts[-2] == "sum": + sum_momenta = True + obs_name = parts[-1] + selection = parts[:-2] + else: + obs_name = parts[-1] + selection = parts[:-1] + select_pids = [] + order_indices = [] + for mp_name in selection: + mp_parts = mp_name.split("_") + if mp_parts[-1].isnumeric(): + order_indices.append(int(mp_parts[-1])) + select_pids.append(multiparticles["_".join(mp_parts[:-1])]) + ordered = True + else: + order_indices.append(0) + select_pids.append(multiparticles[mp_name]) + + return dict( + observable=obs_name, + select_pids=select_pids, + 
sum_momenta=sum_momenta, + sum_observable=sum_observable, + order_observable=order_observable if ordered else None, + order_indices=order_indices if ordered else [], + ignore_incoming=True, + name=name, + ) + + def init_cuts(self) -> None: + inf = float("inf") + order_observable = self.run_card["cuts"].get("order_by", "pt") + self.cut_data = [ + CutItem( + observable_kwargs=self.parse_observable(key, order_observable), + min=values.get("min", -inf), + max=values.get("max", inf), + mode=values.get("mode", "all"), + ) + for key, values in self.run_card["cuts"].items() + if key != "order_by" + ] + + def init_histograms(self) -> None: + inf = float("inf") + order_observable = self.run_card["histograms"].get("order_by", "pt") + #TODO: add reasonable defaults for min, max, bin_count + self.hist_data = [ + HistItem( + observable_kwargs=self.parse_observable(key, order_observable), + min=values["min"], + max=values["max"], + bin_count=values["bin_count"], + ) + for key, values in self.run_card["histograms"].items() + if key != "order_by" + ] + + def init_beam(self) -> None: + beam_args = self.run_card["beam"] + + self.e_cm = beam_args["e_cm"] + self.leptonic = beam_args["leptonic"] + + dynamical_scales = { + "transverse_energy": ms.EnergyScale.transverse_energy, + "transverse_mass": ms.EnergyScale.transverse_mass, + "half_transverse_mass": ms.EnergyScale.half_transverse_mass, + "partonic_energy": ms.EnergyScale.partonic_energy, + } + if beam_args["dynamical_scale_choice"] in dynamical_scales: + dynamical_scale_type = dynamical_scales[beam_args["dynamical_scale_choice"]] + else: + raise ValueError("Unknown dynamical scale choice") + self.scale_kwargs = dict( + dynamical_scale_type=dynamical_scale_type, + ren_scale_fixed=beam_args["fixed_ren_scale"], + fact_scale_fixed=beam_args["fixed_fact_scale"], + ren_scale=beam_args["ren_scale"], + fact_scale1=beam_args["fact_scale1"], + fact_scale2=beam_args["fact_scale2"], + ) + + pdf_set = beam_args["pdf"] + self.pdf_grid = 
ms.PdfGrid(os.path.join(PDF_PATH, pdf_set, f"{pdf_set}_0000.dat")) + self.pdf_grid.initialize_globals(self.context) + self.alphas_grid = ms.AlphaSGrid(os.path.join(PDF_PATH, pdf_set, f"{pdf_set}.info")) + self.alphas_grid.initialize_globals(self.context) + self.running_coupling = ms.RunningCoupling(self.alphas_grid) + + def init_generator_config(self) -> None: + run_args = self.run_card["run"] + gen_args = self.run_card["generation"] + vegas_args = self.run_card["vegas"] + cfg = ms.EventGeneratorConfig() + cfg.target_count = gen_args["events"] + cfg.vegas_damping = vegas_args["damping"] + cfg.max_overweight_truncation = gen_args["max_overweight_truncation"] + cfg.freeze_max_weight_after = gen_args["freeze_max_weight_after"] + cfg.start_batch_size = vegas_args["start_batch_size"] + cfg.max_batch_size = vegas_args["max_batch_size"] + cfg.survey_min_iters = gen_args["survey_min_iters"] + cfg.survey_max_iters = gen_args["survey_max_iters"] + cfg.survey_target_precision = gen_args["survey_target_precision"] + cfg.optimization_patience = vegas_args["optimization_patience"] + cfg.optimization_threshold = vegas_args["optimization_threshold"] + cfg.batch_size = gen_args["batch_size"] + cfg.verbosity = run_args["verbosity"] + self.event_generator_config = cfg + self.event_generator = None + + def init_context(self) -> None: + device_name = self.run_card["run"]["device"] + if device_name == "cpu": + device = ms.cpu_device() + elif device_name == "cuda": + device = ms.cuda_device() + elif device_name == "hip": + device = ms.hip_device() + else: + raise ValueError("Unknown device") + self.context = ms.Context(device) + + def init_subprocesses(self) -> None: + self.subprocesses = [] + for subproc_id, meta in enumerate(self.subprocess_data): + self.subprocesses.append(MadgraphSubprocess(self, meta, subproc_id)) + + def build_event_generator( + self, phasespaces: list[PhaseSpace], file: str + ) -> ms.EventGenerator: + integrands = [] + subproc_ids = [] + channel_names = [] + 
channel_hists = [] + for i, (subproc, phasespace) in enumerate(zip(self.subprocesses, phasespaces)): + subproc_integrands = subproc.build_integrands(phasespace) + integrands.extend(subproc.build_integrands(phasespace)) + subproc_ids.extend([i] * len(phasespace.channels)) + channel_names.extend([f"{i}.{chan.name}" for chan in phasespace.channels]) + if subproc.histograms is not None: + channel_hists.extend([subproc.histograms] * len(phasespace.channels)) + #print(integrands[0].function()) + #integrands[0].function().save("test.json") + #integrands[0] = ms.Function.load("test.json") + return ms.EventGenerator( + context=self.context, + channels=integrands, + temp_file_prefix=os.path.join(self.run_path, file), + status_file=os.path.join(self.run_path, "info.json"), + config=self.event_generator_config, + channel_subprocesses=subproc_ids, + channel_names=channel_names, + channel_histograms=channel_hists, + ) + + def survey_phasespaces( + self, phasespaces: list[PhaseSpace], mode: str | None = None + ) -> ms.EventGenerator: + event_generator = self.build_event_generator( + phasespaces, "events" if mode is None else f"events_{mode}" + ) + + print() + event_generator.survey() + return event_generator + + def survey(self) -> None: + phasespace_mode = self.run_card["phasespace"]["mode"] + if phasespace_mode == "multichannel": + self.phasespaces = [ + subproc.build_multichannel_phasespace() + for subproc in self.subprocesses + ] + self.event_generator = self.survey_phasespaces(self.phasespaces) + elif phasespace_mode == "flat": + self.phasespaces = [ + subproc.build_flat_phasespace() + for subproc in self.subprocesses + ] + self.event_generator = self.survey_phasespaces(self.phasespaces) + elif phasespace_mode == "both": + phasespaces_multi = [ + subproc.build_multichannel_phasespace() + for subproc in self.subprocesses + ] + evgen_multi = self.survey_phasespaces(phasespaces_multi, "multichannel") + + phasespaces_flat = [ + subproc.build_flat_phasespace() + for subproc in 
self.subprocesses + ] + evgen_flat = self.survey_phasespaces(phasespaces_flat, "flat") + + channel_status = evgen_multi.channel_status() + cross_sections = [] + index = 0 + for phasespace in phasespaces_multi: + channel_count = len(phasespace.channels) + cross_sections.append([ + status.mean + for status in channel_status[index:index + channel_count] + ]) + index += channel_count + + self.phasespaces = [ + subproc.simplify_phasespace(ps_multi, ps_flat, cross_secs) + for subproc, ps_multi, ps_flat, cross_secs in zip( + self.subprocesses, phasespaces_multi, phasespaces_flat, cross_sections + ) + ] + + if not self.run_card["madnis"]["enable"]: + self.event_generator = self.build_event_generator(self.phasespaces, "events") + #TODO: avoid to run survey again + self.event_generator.survey() + else: + raise ValueError("Unknown phasespace mode") + + def train_madnis(self) -> None: + madnis_args = self.run_card["madnis"] + if not madnis_args["enable"]: + return + + madnis_phasespaces = [] + for subproc, phasespace in zip(self.subprocesses, self.phasespaces): + phasespace = subproc.build_madnis(phasespace) + subproc.train_madnis(phasespace) + madnis_phasespaces.append(phasespace) + self.phasespaces = madnis_phasespaces + self.event_generator = self.build_event_generator(madnis_phasespaces, "events") + self.event_generator.survey() #TODO: avoid + + def generate_events(self) -> None: + start_time = get_start_time() + self.event_generator.generate() + output_format = self.run_card["run"]["output_format"] + if output_format == "compact_npy": + self.event_generator.combine_to_compact_npy( + os.path.join(self.run_path, "events.npy") + ) + elif output_format == "lhe_npy": + lhe_completer = self.build_lhe_completer() + self.event_generator.combine_to_lhe_npy( + os.path.join(self.run_path, "events.npy"), lhe_completer + ) + elif output_format == "lhe": + lhe_completer = self.build_lhe_completer() + self.event_generator.combine_to_lhe( + os.path.join(self.run_path, "events.lhe"), 
lhe_completer + ) + else: + raise ValueError("Unknown output format") + self.save_gridpack() + + def build_lhe_completer(self): + subproc_args = [] + for subproc, meta in zip(self.subprocesses, self.subprocess_data): + ( + _, + _, + topologies, + permutations, + _, + _, + diagram_indices, + diagram_color_indices, + ) = subproc.build_multi_channel_data() + subproc_args.append( + ms.SubprocArgs( + topologies = [topo[0] for topo in topologies], + permutations = permutations, + diagram_indices = diagram_indices, + diagram_color_indices = diagram_color_indices, + color_flows = meta["color_flows"], + pdg_color_types = { + int(key): value + for key, value in meta["pdg_color_types"].items() + }, + helicities = meta["helicities"], + pdg_ids = [flavor["options"] for flavor in meta["flavors"]], + matrix_flavor_indices = [ + flavor["index"] for flavor in meta["flavors"] + ], + ) + ) + return ms.LHECompleter( + subproc_args=subproc_args, + bw_cutoff=self.run_card["phasespace"]["bw_cutoff"] + ) + + def save_gridpack(self) -> None: + gridpack_path = os.path.join(self.run_path, "gridpack") + os.mkdir(gridpack_path) + self.context.save(os.path.join(gridpack_path, "context.json")) + + def get_mass(self, pid: int) -> float: + return self.param_card.get_value("mass", pid) + + def get_width(self, pid: int) -> float: + return self.param_card.get_value("width", pid) + + +def clean_pids(pids: list[int]) -> list[int]: + pids_out = [] + for pid in pids: + pid = abs(pid) + if pid == 81: + pid = 1 + if pid == 82: + pid = 11 + pids_out.append(pid) + return pids_out + + +class MadgraphSubprocess: + def __init__(self, process: MadgraphProcess, meta: dict, subproc_id: int): + self.process = process + self.meta = meta + self.subproc_id = subproc_id + self.multi_channel_data = None + + api_path = self.meta["path"] + if not os.path.isfile(api_path): + cwd = os.getcwd() + api_dir = os.path.dirname(api_path) + logger.info(f"Compiling subprocess {api_dir}") + os.chdir(api_dir) + 
subprocess.run(["make"]) + os.chdir(cwd) + + self.incoming_masses = [ + self.process.get_mass(pid) for pid in clean_pids(self.meta["incoming"]) + ] + self.outgoing_masses = [ + self.process.get_mass(pid) for pid in clean_pids(self.meta["outgoing"]) + ] + self.particle_count = len(self.incoming_masses) + len(self.outgoing_masses) + all_pids = clean_pids(self.meta["incoming"]) + clean_pids(self.meta["outgoing"]) + self.cuts = ( + ms.Cuts([ + ms.CutItem( + observable=ms.Observable(all_pids, **cut_item.observable_kwargs), + min=cut_item.min, + max=cut_item.max, + mode=cut_item.mode, + ) + for cut_item in self.process.cut_data + ]) + if len(self.process.cut_data) > 0 + else None + ) + self.histograms = ( + ms.ObservableHistograms([ + ms.HistItem( + observable=ms.Observable(all_pids, **hist_item.observable_kwargs), + min=hist_item.min, + max=hist_item.max, + bin_count=hist_item.bin_count, + ) + for hist_item in self.process.hist_data + ]) + if len(self.process.hist_data) > 0 + else None + ) + + self.scale = ms.EnergyScale( + particle_count=self.particle_count, **self.process.scale_kwargs + ) + + if self.process.run_card["run"]["dummy_matrix_element"]: + self.matrix_element = None + else: + self.matrix_element = self.process.context.load_matrix_element( + api_path, self.process.param_card_path + ) + + def build_multi_channel_data(self) -> MultiChannelData: + if self.multi_channel_data is not None: + return self.multi_channel_data + + diagram_count = self.meta["diagram_count"] + bw_cutoff = self.process.run_card["phasespace"]["bw_cutoff"] + + amp2_remap = [-1] * diagram_count + symfact = [] + topologies = [] + permutations = [] + channel_indices = [] + channel_weight_indices = [] + diagram_indices = [] + diagram_color_indices = [] + channel_index = 0 + + for channel_id, channel in enumerate(self.meta["channels"]): + propagators = [] + for i, pid in enumerate(clean_pids(channel["propagators"])): + mass = self.process.get_mass(pid) + width = self.process.get_width(pid) + if 
i in channel["on_shell_propagators"]: + e_min = mass - bw_cutoff * width + e_max = mass + bw_cutoff * width + else: + e_min = 0 + e_max = 0 + propagators.append(ms.Propagator( + mass=mass, + width=width, + integration_order=0, + e_min=e_min, + e_max=e_max, + )) + vertices = channel["vertices"] + diagrams = channel["diagrams"] + chan_permutations = [d["permutation"] for d in diagrams] + diag = ms.Diagram( + self.incoming_masses, self.outgoing_masses, propagators, vertices + ) + chan_topologies = ms.Topology.topologies(diag) + topo_count = len(chan_topologies) + + amp2_remap[diagrams[0]["diagram"]] = channel_index + channel_index_first = channel_index + symfact_index_first = len(symfact) + channel_index += 1 + symfact.extend([None] * topo_count) + for d in diagrams[1:]: + amp2_remap[d["diagram"]] = channel_index + channel_index += 1 + symfact.extend(range(symfact_index_first, symfact_index_first + topo_count)) + + topologies.append(chan_topologies) + permutations.append(chan_permutations) + channel_indices.append(list(range(channel_index_first, channel_index))) + channel_weight_indices.append([ + [ + symfact_index_first + topo_index + i * topo_count + for i in range(len(chan_permutations)) + ] + for topo_index in range(topo_count) + ]) + diagram_indices.append([d["diagram"] for d in diagrams]) + diagram_color_indices.append([d["active_colors"] for d in diagrams]) + self.multi_channel_data = MultiChannelData( + amp2_remap, + symfact, + topologies, + permutations, + channel_indices, + channel_weight_indices, + diagram_indices, + diagram_color_indices, + ) + return self.multi_channel_data + + def build_multichannel_phasespace(self) -> PhaseSpace: + ( + amp2_remap, + symfact, + topologies, + permutations, + channel_indices, + channel_weight_indices, + diagram_indices, + _, + ) = self.build_multi_channel_data() + + channels = [] + t_channel_mode = self.t_channel_mode( + self.process.run_card["phasespace"]["t_channel"] + ) + for channel_id, (chan_topologies, 
chan_permutations, chan_indices) in enumerate(zip( + topologies, permutations, channel_weight_indices + )): + topo_count = len(chan_topologies) + for topo_index, (topo, indices) in enumerate(zip(chan_topologies, chan_indices)): + mapping = ms.PhaseSpaceMapping( + chan_topologies[0], + self.process.e_cm, + t_channel_mode=t_channel_mode, + cuts=self.cuts, + invariant_power=self.process.run_card["phasespace"]["invariant_power"], + permutations=chan_permutations, + leptonic=self.process.leptonic, + ) + prefix = f"subproc{self.subproc_id}.channel{channel_id}" + if topo_count > 1: + prefix += f".subchan{topo_index}" + discrete_before, discrete_after = self.build_discrete( + len(chan_permutations), len(self.meta["flavors"]), prefix + ) + channels.append(Channel( + phasespace_mapping = mapping, + adaptive_mapping = self.build_vegas(mapping, prefix), + discrete_before = discrete_before, + discrete_after = discrete_after, + channel_weight_indices = indices, + name = f"{channel_id}", + )) + + chan_weight_remap = list(range(len(symfact))) #TODO: only construct if necessary + if self.process.run_card["phasespace"]["sde_strategy"] == "denominators": + prop_chan_weights = ms.PropagatorChannelWeights( + [topo[0] for topo in topologies], permutations, channel_indices + ) + indices_for_subchan = channel_indices + else: + prop_chan_weights = None + indices_for_subchan = diagram_indices + + if any(len(topos) > 1 for topos in topologies): + subchan_weights = ms.SubchannelWeights( + topologies, permutations, indices_for_subchan + ) + else: + subchan_weights = None + if prop_chan_weights is None: + chan_weight_remap = [ + len(symfact) if remap == -1 else remap for remap in amp2_remap + ] + + return PhaseSpace( + mode="multichannel", + channels=channels, + chan_weight_remap=chan_weight_remap, + symfact=symfact, + prop_chan_weights=prop_chan_weights, + subchan_weights=subchan_weights, + ) + + def build_flat_phasespace(self) -> PhaseSpace: + mapping = ms.PhaseSpaceMapping( + 
self.incoming_masses + self.outgoing_masses, + self.process.e_cm, + mode=self.t_channel_mode(self.process.run_card["phasespace"]["flat_mode"]), + cuts=self.cuts, + leptonic=self.process.leptonic, + ) + prefix = f"subproc{self.subproc_id}.flat" + discrete_before, discrete_after = self.build_discrete( + 1, len(self.meta["flavors"]), prefix + ) + channel = Channel( + phasespace_mapping = mapping, + adaptive_mapping = self.build_vegas(mapping, prefix), + discrete_before = discrete_before, + discrete_after = discrete_after, + channel_weight_indices = [0], + name = "F", + ) + return PhaseSpace( + mode="flat", + channels=[channel], + chan_weight_remap=[0] * self.meta["diagram_count"], + symfact=[None], + ) + + def simplify_phasespace( + self, + multi_phasespace: PhaseSpace, + flat_phasespace: PhaseSpace | None, + cross_sections: list[float] + ) -> PhaseSpace: + assert multi_phasespace.mode == "multichannel" + + kept_count = self.process.run_card["phasespace"]["simplified_channel_count"] + if len(multi_phasespace.channels) <= kept_count: + return multi_phasespace + + assert flat_phasespace is not None and flat_phasespace.mode == "flat" + #TODO: need to be careful here in the case of flavor sampling + #TODO: come up with some smarter heuristic than just channel cross section + #TODO: deal with resonances in a smart way + kept_channels = [ + index + for index, cs in sorted( + enumerate(cross_sections), key=lambda pair: pair[1], reverse=True + ) + ][:kept_count] + + channels = [] + channel_map = {} + symfact = [] + for old_chan_index in kept_channels: + channel = multi_phasespace.channels[old_chan_index] + perm_count = max(1, channel.phasespace_mapping.channel_count()) + channel_index = len(symfact) + symfact.append(None) + symfact.extend([channel_index] * (perm_count - 1)) + channel_map.update({ + old_index: new_index + for new_index, old_index in enumerate( + channel.channel_weight_indices, start=channel_index + ) + }) + channels.append(Channel( + phasespace_mapping = 
channel.phasespace_mapping, + adaptive_mapping = channel.adaptive_mapping, + discrete_before = channel.discrete_before, + discrete_after = channel.discrete_after, + channel_weight_indices = list(range( + channel_index, channel_index + perm_count + )), + name = channel.name, + )) + + flat_channel = flat_phasespace.channels[0] + channels.append(Channel( + phasespace_mapping = flat_channel.phasespace_mapping, + adaptive_mapping = flat_channel.adaptive_mapping, + discrete_before = flat_channel.discrete_before, + discrete_after = flat_channel.discrete_after, + channel_weight_indices = [len(symfact)], + name = flat_channel.name, + )) + flat_index = len(symfact) + symfact.append(None) + channel_map[len(multi_phasespace.symfact)] = len(symfact) + chan_weight_remap = [ + channel_map.get(remap, flat_index) + for remap in multi_phasespace.chan_weight_remap + ] + + return PhaseSpace( + mode="both", + channels=channels, + chan_weight_remap=chan_weight_remap, + symfact=symfact, + prop_chan_weights=multi_phasespace.prop_chan_weights, + subchan_weights=multi_phasespace.subchan_weights, + ) + + def build_madnis(self, phasespace: PhaseSpace) -> PhaseSpace: + madnis_args = self.process.run_card["madnis"] + channels = [] + for channel_id, channel in enumerate(phasespace.channels): + discrete_before = channel.discrete_before + if discrete_before is not None: + #TODO: build discrete flows + pass + + perm_count = channel.phasespace_mapping.channel_count() + #cond_dim = perm_count if perm_count > 1 else 0 + flow_dim = channel.phasespace_mapping.random_dim() + prefix = f"subproc{self.subproc_id}.channel{channel_id}" + flow = ms.Flow( + input_dim=flow_dim, + condition_dim=0, + prefix=prefix, + bin_count=madnis_args["flow_spline_bins"], + subnet_hidden_dim=madnis_args["flow_hidden_dim"], + subnet_layers=madnis_args["flow_layers"], + subnet_activation=self.activation(madnis_args["flow_activation"]), + invert_spline=madnis_args["flow_invert_spline"], + ) + if channel.adaptive_mapping is None: 
+ flow.initialize_globals(self.process.context) + else: + flow.initialize_from_vegas( + self.process.context, channel.adaptive_mapping.grid_name() + ) + #cond_dim += flow_dim + + discrete_after = channel.discrete_after + if discrete_after is not None: + discrete_after = ms.DiscreteFlow( + option_counts=[len(self.meta["flavors"])], + prefix=f"{prefix}.discrete_after", + dims_with_prior=[0], + condition_dim=flow_dim, + subnet_hidden_dim=madnis_args["discrete_hidden_dim"], + subnet_layers=madnis_args["discrete_layers"], + subnet_activation=self.activation(madnis_args["discrete_activation"]), + ) + discrete_after.initialize_globals(self.process.context) + + channels.append(Channel( + phasespace_mapping = channel.phasespace_mapping, + adaptive_mapping = flow, + discrete_before = discrete_before, + discrete_after = discrete_after, + channel_weight_indices = channel.channel_weight_indices, + name = channel.name, + )) + + return PhaseSpace( + mode="both", + channels=channels, + chan_weight_remap=phasespace.chan_weight_remap, + symfact=phasespace.symfact, + cwnet=self.build_cwnet(len(phasespace.symfact)), + prop_chan_weights=phasespace.prop_chan_weights, + subchan_weights=phasespace.subchan_weights, + ) + + def build_vegas(self, mapping: ms.PhaseSpaceMapping, prefix: str) -> ms.VegasMapping: + if not self.process.run_card["vegas"]["enable"]: + return None + + vegas = ms.VegasMapping( + mapping.random_dim(), + self.process.run_card["vegas"]["bins"], + prefix, + ) + vegas.initialize_globals(self.process.context) + return vegas + + def build_discrete( + self, permutation_count: int, flavor_count: int, prefix: str + ) -> tuple[ms.DiscreteSampler | None, ms.DiscreteSampler | None]: + #return None, None + discrete_before = None + #if permutation_count > 1: + # discrete_before = ms.DiscreteSampler( + # [permutation_count], f"{prefix}.discrete_before" + # ) + # discrete_before.initialize_globals(self.process.context) + #else: + # discrete_before = None + + if flavor_count > 1: + 
discrete_after = ms.DiscreteSampler( + [flavor_count], f"{prefix}.discrete_after", [0] + ) + discrete_after.initialize_globals(self.process.context) + else: + discrete_after = None + + return discrete_before, discrete_after + + def build_cwnet(self, channel_count: int) -> ms.ChannelWeightNetwork: + madnis_args = self.process.run_card["madnis"] + cwnet = ms.ChannelWeightNetwork( + channel_count=channel_count, + particle_count=self.particle_count, + hidden_dim=madnis_args["cwnet_hidden_dim"], + layers=madnis_args["cwnet_layers"], + activation=self.activation(madnis_args["cwnet_activation"]), + prefix=f"subproc{self.subproc_id}.cwnet", + ) + cwnet.initialize_globals(self.process.context) + return cwnet + + def t_channel_mode(self, name: str) -> ms.PhaseSpaceMapping.TChannelMode: + modes = { + "propagator": ms.PhaseSpaceMapping.propagator, + "rambo": ms.PhaseSpaceMapping.rambo, + "chili": ms.PhaseSpaceMapping.chili, + } + if name in modes: + return modes[name] + else: + raise ValueError(f"Invalid t-channel mode '{name}'") + + def activation(self, name: str) -> ms.MLP.Activation: + activations = { + "relu": ms.MLP.relu, + "leaky_relu": ms.MLP.leaky_relu, + "elu": ms.MLP.elu, + "gelu": ms.MLP.gelu, + "sigmoid": ms.MLP.sigmoid, + "softplus": ms.MLP.softplus, + } + if name in activations: + return activations[name] + else: + raise ValueError(f"Invalid activation function '{name}'") + + def build_integrands( + self, + phasespace: PhaseSpace, + flags: int = ms.EventGenerator.integrand_flags + ) -> list[ms.Integrand]: + flavors = [flav["options"][0] for flav in self.meta["flavors"]] + if self.matrix_element: + matrix_element = ms.MatrixElement( + self.matrix_element, + ms.Integrand.matrix_element_inputs, + ms.Integrand.matrix_element_outputs, + True, + ) + else: + matrix_element = ms.MatrixElement( + 0xBADCAFE, + self.particle_count, + ms.Integrand.matrix_element_inputs, + ms.Integrand.matrix_element_outputs, + self.meta["diagram_count"], + True, + ) + pdf_grid = ( + None + 
if len(flavors) > 1 or self.process.leptonic + else self.process.pdf_grid + ) + cross_section = ms.DifferentialCrossSection( + matrix_element=matrix_element, + cm_energy=self.process.e_cm, + running_coupling=self.process.running_coupling, + energy_scale=self.scale, + pid_options=flavors, + has_pdf1=not self.process.leptonic, + has_pdf2=not self.process.leptonic, + pdf_grid1=pdf_grid, + pdf_grid2=pdf_grid, + has_mirror=self.meta["has_mirror_process"], + input_momentum_fraction=True, + ) + integrands = [] + for channel in phasespace.channels: + integrands.append(ms.Integrand( + channel.phasespace_mapping, + cross_section, + channel.adaptive_mapping, + channel.discrete_before, + channel.discrete_after, + self.process.pdf_grid, + self.scale, + phasespace.prop_chan_weights, + phasespace.subchan_weights, + phasespace.cwnet, + phasespace.chan_weight_remap, + len(phasespace.symfact), + flags, + channel.channel_weight_indices, + )) + #print(integrands[0].function()) + #print(integrands[1].function()) + return integrands + + def train_madnis(self, phasespace: PhaseSpace) -> None: + print("Training MadNIS") + # do import here to make pytorch and MadNIS optional dependencies + from .train_madnis import train_madnis, MADNIS_INTEGRAND_FLAGS + start_time = get_start_time() + train_madnis( + self.build_integrands(phasespace, MADNIS_INTEGRAND_FLAGS), + phasespace, + self.process.run_card["madnis"], + self.process.context + ) + print_run_time(start_time) + + +def ask_edit_cards() -> None: + #TODO: these imports break when trying to generate flame graphs, so do them locally for now + from madgraph.interface.common_run_interface import CommonRunCmd, AskforEditCard + from madgraph.interface.extended_cmd import Cmd + + #TODO: some rather disgusting monkey-patching to make editing cards work + class MG7Cmd(Cmd): + def __init__(self): + super().__init__(".", {}) + self.proc_characteristics = None + def do_open(self, line): + CommonRunCmd.do_open(self, line) + def check_open(self, args): + 
CommonRunCmd.check_open(self, args) + old_define_paths = AskforEditCard.define_paths + def define_paths(self, **opt): + old_define_paths(self, **opt) + self.paths["run"] = os.path.join(self.me_dir, "Cards", "run_card.toml") + self.paths["run_card.toml"] = os.path.join(self.me_dir, "Cards", "run_card.toml") + AskforEditCard.define_paths = define_paths + AskforEditCard.reload_card = lambda self, path: None + + cmd = MG7Cmd() + CommonRunCmd.ask_edit_card_static( + ["param_card.dat", "run_card.toml"], + pwd=".", + ask=cmd.ask, + plot=False + ) + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("-f", action="store_false", dest="ask_edit_cards") + args = parser.parse_args() + if args.ask_edit_cards: + ask_edit_cards() + + process = MadgraphProcess() + process.survey() + process.train_madnis() + process.generate_events() diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml new file mode 100644 index 0000000000..0af6eab2bf --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml @@ -0,0 +1,124 @@ +[run] +run_name = "run" +device = "cpu" # options: cpu, cuda +# options: +# -1 to choose automatically +# on x86: 1, 4, 8 +# on Apple silicon: 1, 2 +simd_vector_size = -1 +thread_pool_size = -1 # -1 sets count automatically based on number of CPUs +output_format = "compact_npy" # options: compact_npy, lhe_npy, lhe +verbosity = "pretty" # options: silent, pretty, log +dummy_matrix_element = false + +[beam] +e_cm = 13000.0 +leptonic = false +pdf = "NNPDF23_lo_as_0130_qed" +fixed_ren_scale = true +fixed_fact_scale = true +ren_scale = 91.188 +fact_scale1 = 91.188 +fact_scale2 = 91.188 +# options: transverse_energy, transverse_mass, half_transverse_mass, partonic_energy +dynamical_scale_choice = "half_transverse_mass" + +[generation] +events = 10000 +max_overweight_truncation = 0.01 +freeze_max_weight_after = 10000 
+batch_size = 1000 +survey_min_iters = 3 +survey_max_iters = 3 +survey_target_precision = 0.1 + +[vegas] +enable = true +bins = 64 +damping = 0.4 +optimization_patience = 5 +optimization_threshold = 0.9 +start_batch_size = 1000 +max_batch_size = 32000 + +[phasespace] +mode = "both" #options: multichannel, flat, both +sde_strategy = "diagrams" #options: diagrams, denominators +decays = "all" # options: all, massive, none +t_channel = "propagator" # options: propagator, rambo, chili +flat_mode = "rambo" # options: propagator, rambo, chili +simplified_channel_count = 10 +invariant_power = 0.7 +bw_cutoff = 15 + +[multiparticles] +jet = [1, 2, 3, 4, -1, -2, -3, -4, 21] +bottom = [-5, 5] +lepton = [11, 13, 15, -11, -13, -15] +missing = [12, 14, 16, -12, -14, -16] +photon = [22] + +[cuts] +# possible groups: jet, bottom, lepton, missing, photon +# possible observables: pt, eta_abs, delta_r, mass, sqrt_s +# (mass is for pairs of particles from the same group, sqrt_s is for all outgoing particles) +# for all cuts, min or max can be specified + +jet-pt.min = 20.0 +jet-eta_abs.max = 5.0 +jet-delta_r.min = 0.4 + +lepton-pt.min = 10.0 +lepton-eta_abs.max = 2.5 +lepton-delta_r.min = 0.4 + +jet-lepton-delta_r.min=0.4 + +sqrt_s.min = 0.0 + +[histograms] + + +[madnis] +enable = false + +# normalizing flow parameters +flow_hidden_dim = 64 +flow_layers = 3 +flow_spline_bins = 10 +flow_activation = "leaky_relu" # options: relu, leaky_relu, elu, gelu, sigmoid, softplus +flow_invert_spline = false + +# discrete dimensions +discrete_hidden_dim = 64 +discrete_layers = 3 +discrete_activation = "leaky_relu" # options: relu, leaky_relu, elu, gelu, sigmoid, softplus + +# channel weight network +cwnet_hidden_dim = 64 +cwnet_layers = 3 +cwnet_activation = "leaky_relu" # options: relu, leaky_relu, elu, gelu, sigmoid, softplus + +# training parameters +loss = "stratified_variance" # options: stratified_variance, kl_divergence, rkl_divergence +train_batches = 1000 +log_interval = 100 +batch_size_offset = 
512 +batch_size_per_channel = 128 +lr = 1e-3 +lr_decay = 0.01 +lr_max = 3e-3 +lr_scheduler = "cosine" # options: exponential, inverse, onecycle, cosine, none +train_mcw = true +buffer_capacity = 0 +minimum_buffer_size = 50000 +buffered_steps = 0 +uniform_channel_ratio = 0.1 +integration_history_length = 1000 +max_stored_channel_weights = 100 +channel_dropping_threshold = 0.001 +channel_dropping_interval = 300 +drop_zero_integrands = true +batch_size_threshold = 0.5 +channel_grouping_mode = "uniform" # options: none, uniform, learned +fixed_cwnet_fraction = 0.33 diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/train_madnis.py b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/train_madnis.py new file mode 100644 index 0000000000..9bd83b8ad1 --- /dev/null +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/train_madnis.py @@ -0,0 +1,124 @@ +import torch +import numpy as np +import madspace as ms +from madspace.madnis import ( + ChannelGrouping, + Integrator, + stratified_variance, + kl_divergence, + rkl_divergence, + build_madnis_integrand, + MADNIS_INTEGRAND_FLAGS, +) + + +def train_madnis( + integrands: list[ms.Integrand], + phasespace, + madnis_args: dict, + context: ms.Context, +) -> None: + channel_grouping = ( + None if phasespace.symfact is None else ChannelGrouping(phasespace.symfact) + ) + madnis_integrand, flow, cwnet = build_madnis_integrand( + integrands, phasespace.cwnet, channel_grouping, context + ) + + loss = { + "stratified_variance": stratified_variance, + "kl_divergence": kl_divergence, + "rkl_divergence": rkl_divergence, + }[madnis_args["loss"]] + + def build_scheduler(optimizer): + if madnis_args["lr_scheduler"] == "exponential": + decay_rate = madnis_args["lr_decay"] ** ( + 1 / max(madnis_args["train_batches"], 1) + ) + return torch.optim.lr_scheduler.ExponentialLR( + optimizer, gamma=decay_rate + ) + elif madnis_args["lr_scheduler"] == "onecycle": + return torch.optim.lr_scheduler.OneCycleLR( + optimizer, 
+ max_lr=madnis_args["lr_max"], + total_steps=madnis_args["train_batches"], + ) + elif madnis_args["lr_scheduler"] == "cosine": + return torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=madnis_args["train_batches"] + ) + else: + return None + + madevent_device = context.device() + if madevent_device == ms.cpu_device(): + device = torch.device("cpu") + elif madevent_device == ms.cuda_device(): + device = torch.device("cuda") + elif madevent_device == ms.hip_device(): + device = torch.device("cuda") + + integrator = Integrator( + integrand=madnis_integrand, + flow=flow, + train_channel_weights=cwnet is not None, + cwnet=cwnet, + loss=loss, + batch_size=madnis_args["batch_size_offset"], + batch_size_per_channel=madnis_args["batch_size_per_channel"], + learning_rate=madnis_args["lr"], + scheduler=build_scheduler, + uniform_channel_ratio=madnis_args["uniform_channel_ratio"], + integration_history_length=madnis_args["integration_history_length"], + drop_zero_integrands=madnis_args["drop_zero_integrands"], + batch_size_threshold=madnis_args["batch_size_threshold"], + buffer_capacity=madnis_args["buffer_capacity"], + minimum_buffer_size=madnis_args["minimum_buffer_size"], + buffered_steps=madnis_args["buffered_steps"], + max_stored_channel_weights=madnis_args["max_stored_channel_weights"], + channel_dropping_threshold=madnis_args["channel_dropping_threshold"], + channel_dropping_interval=madnis_args["channel_dropping_interval"], + channel_grouping_mode="uniform", + freeze_cwnet_iteration=int( + madnis_args["train_batches"] * (1 - madnis_args["fixed_cwnet_fraction"]) + ), + device=torch.device("cpu" if context.device() == ms.cpu_device() else "cuda:0"), + dtype=torch.float64, + ) + + online_losses = [] + buffered_losses = [] + log_interval = madnis_args["log_interval"] + def callback(status): + if status.buffered: + buffered_losses.append(status.loss) + else: + online_losses.append(status.loss) + batch = status.step + 1 + if batch % log_interval != 0: + return 
+ online_loss = np.mean(online_losses) + info = [f"Batch {batch:6d}: loss={online_loss:.6f}"] + if len(buffered_losses) > 0: + buffered_loss = np.mean(buffered_losses) + info.append(f"buf={buffered_loss:.6f}") + if status.learning_rate is not None: + info.append(f"lr={status.learning_rate:.4e}") + if status.dropped_channels > 0: + info.append(f"drop={status.dropped_channels}") + + print(", ".join(info)) + online_losses.clear() + buffered_losses.clear() + + integrator.train(madnis_args["train_batches"], callback) + + phasespace.channels = [ + channel + for channel, active in zip( + phasespace.channels, integrator.active_channels_mask + ) + if active + ] diff --git a/PLUGIN/CUDACPP_OUTPUT/model_handling.py b/PLUGIN/CUDACPP_OUTPUT/model_handling.py index bc961f6d60..b1bc718d62 100644 --- a/PLUGIN/CUDACPP_OUTPUT/model_handling.py +++ b/PLUGIN/CUDACPP_OUTPUT/model_handling.py @@ -1746,19 +1746,19 @@ def generate_process_files(self): files.ln(pjoin(self.path, 'cudacpp.mk'), self.path, 'makefile') # Add link to makefile_original.mk, PR #1052 files.ln(pjoin(self.path, '..', 'makefile_original.mk'), self.path, 'makefile_original.mk') - # Add symbolic links in the test directory - files.ln(pjoin(self.path + '/../../test', 'cudacpp_test.mk'), self.path + '/../../test', 'makefile') - # Add reference file in the test directory (if it exists for this process) - import pathlib - pathlib.Path(self.path + '/../../test/ref/.keepme').touch() - ###template_ref = 'dump_CPUTest.'+self.process_name+'.txt' - template_ref = self.template_path + '/../../../test/ref/' + 'dump_CPUTest.' 
+ self.process_name + '.txt' - for ref in template_ref, template_ref + '2' : # two different reference files for tests without/with multichannel #896 - if os.path.exists( ref ): - ###misc.sprint( 'Copying test reference file: ', ref ) - PLUGIN_export_cpp.cp( ref, self.path + '/../../test/ref' ) - ###else: - ###misc.sprint( 'Test reference file does not exist and will not be copied: ', ref ) + # # Add symbolic links in the test directory + # files.ln(pjoin(self.path + '/../../test', 'cudacpp_test.mk'), self.path + '/../../test', 'makefile') + # # Add reference file in the test directory (if it exists for this process) + # import pathlib + # pathlib.Path(self.path + '/../../test/ref/.keepme').touch() + # ###template_ref = 'dump_CPUTest.'+self.process_name+'.txt' + # template_ref = self.template_path + '/../../../test/ref/' + 'dump_CPUTest.' + self.process_name + '.txt' + # for ref in template_ref, template_ref + '2' : # two different reference files for tests without/with multichannel #896 + # if os.path.exists( ref ): + # ###misc.sprint( 'Copying test reference file: ', ref ) + # PLUGIN_export_cpp.cp( ref, self.path + '/../../test/ref' ) + # ###else: + # ###misc.sprint( 'Test reference file does not exist and will not be copied: ', ref ) # SR - generate CMakeLists.txt file inside the P* directory def edit_CMakeLists(self): diff --git a/PLUGIN/CUDACPP_OUTPUT/output.py b/PLUGIN/CUDACPP_OUTPUT/output.py index 1f76172cda..b2c669b1c1 100644 --- a/PLUGIN/CUDACPP_OUTPUT/output.py +++ b/PLUGIN/CUDACPP_OUTPUT/output.py @@ -49,7 +49,7 @@ # AV - define the plugin's process exporter # (NB: this is the plugin's main class, enabled in the new_output dictionary in __init__.py) -class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterCPP): +class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterMG7): # Class structure information # - object # - VirtualExporter(object) [in madgraph/iolibs/export_v4.py] @@ -94,12 +94,12 @@ class 
PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterCPP): from_template = {'.': [s+'.clang-format', s+'CMake/CMakeLists.txt', s+'COPYRIGHT', s+'COPYING', s+'COPYING.LESSER' ], 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], - 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', + 'src': [s+'mg7/api.h', s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', s+'gpu/constexpr_math.h', s+'gpu/cudacpp_config.mk', s+'CMake/src/CMakeLists.txt' ], - 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', + 'SubProcesses': [s+'mg7/api.cpp', s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/color_sum.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', @@ -123,7 +123,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterCPP): s+'gpu/cudacpp_overlay.mk', s+'gpu/makefile_wrapper.mk', s+'gpu/umami.h', s+'gpu/umami.cc', s+'CMake/SubProcesses/CMakeLists.txt'], - 'test': [s+'gpu/cudacpp_test.mk']} + 'Cards': [s+'mg7/run_card.toml'] } to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', @@ -176,21 +176,10 @@ def __init__(self, *args, **kwargs): return super().__init__(*args, **kwargs) # AV - overload the default version: create CMake directory, do not create lib directory - def copy_template(self, model): + def copy_template_simd(self, model): misc.sprint('Entering PLUGIN_ProcessExporter.copy_template (initialise the directory)') - try: os.mkdir(self.dir_path) - except os.error as error: logger.warning(error.strerror + ' ' + self.dir_path) + super().copy_template_simd(model) with misc.chdir(self.dir_path): - logger.info('Creating subdirectories in directory %s' % self.dir_path) - for d in ['src', 'Cards', 'SubProcesses', 'CMake', 'test', 'test/ref']: # AV - added CMake, test, test/ref; 
removed lib - try: os.mkdir(d) - except os.error as error: logger.warning(error.strerror + ' ' + os.path.join(self.dir_path,d)) - # Write param_card - open(os.path.join('Cards','param_card.dat'), 'w').write(model.write_param_card()) - # Copy files in various subdirectories - for key in self.from_template: - for f in self.from_template[key]: - PLUGIN_export_cpp.cp(f, key) # NB this assumes directory key exists... # Copy src makefile if self.template_src_make: makefile_src = self.read_template_file(self.template_src_make) % {'model': self.get_model_name(model.get('name'))} @@ -199,10 +188,10 @@ def copy_template(self, model): if self.template_Sub_make: makefile = self.read_template_file(self.template_Sub_make) % {'model': self.get_model_name(model.get('name'))} open(os.path.join('SubProcesses', 'cudacpp.mk'), 'w').write(makefile) - # Copy test makefile - if self.template_tst_make: - makefile_test = self.read_template_file(self.template_tst_make) % {'model': self.get_model_name(model.get('name'))} - open(os.path.join('test', 'cudacpp_test.mk'), 'w').write(makefile_test) + # # Copy test makefile + # if self.template_tst_make: + # makefile_test = self.read_template_file(self.template_tst_make) % {'model': self.get_model_name(model.get('name'))} + # open(os.path.join('test', 'cudacpp_test.mk'), 'w').write(makefile_test) # OM - overload export_v4.py version to add additional_clean section (and avoid patchMad.sh for Source/makefile) def write_source_makefile(self, writer, model=None, default=None): @@ -219,6 +208,7 @@ def write_source_makefile(self, writer, model=None, default=None): # AV - add debug printouts (in addition to the default one from OM's tutorial) def generate_subprocess_directory(self, subproc_group, fortran_model, me=None): + # used only for standalone misc.sprint('Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory)') misc.sprint(' type(subproc_group)=%s'%type(subproc_group)) # e.g. 
madgraph.core.helas_objects.HelasMatrixElement misc.sprint(' type(fortran_model)=%s'%type(fortran_model)) # e.g. madgraph.iolibs.helas_call_writers.GPUFOHelasCallWriter @@ -281,6 +271,8 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): self.add_madevent_plugin_fct() # Added by OM # do not call standard finalize since is this is already done... #return super().finalize(matrix_element, cmdhistory, MG5options, outputflag) + else: + super().finalize() # AV (default from OM's tutorial) - overload settings and add a debug printout def modify_grouping(self, matrix_element): @@ -307,6 +299,42 @@ def add_madevent_plugin_fct(self): files.cp(pjoin(plugin_path, 'launch_plugin.py'), pjoin(self.dir_path, 'bin', 'internal')) files.ln(pjoin(self.dir_path, 'lib'), pjoin(self.dir_path, 'SubProcesses')) +class MG7_SIMD_ProcessExporter(PLUGIN_ProcessExporter): + lib_suffix = "cpp" + + @classmethod + def change_output_args(cls, args, cmd): + """ """ + args.append('--hel_recycling=False') + # path relative to the process directory + args.append('--simd=lib/libmg5amc_{processid_short}_' + cls.lib_suffix + ".so") + if 'vector_size' not in ''.join(args): + args.append('--vector_size=16') + if 'nb_wrap' not in ''.join(args): + args.append('--nb_wrap=1') + return args + +class MG7_GPU_ProcessExporter(PLUGIN_ProcessExporter): + lib_suffix = "gpu" + + @classmethod + def change_output_args(cls, args, cmd): + """ """ + args.append('--hel_recycling=False') + # path relative to the process directory + args.append('--gpu=lib/libmg5amc_{processid_short}_' + cls.lib_suffix + ".so") + if 'vector_size' not in ''.join(args): + args.append('--vector_size=32') + if 'nb_wrap' not in ''.join(args): + args.append('--nb_wrap=512') + return args + +class MG7_CUDA_ProcessExporter(MG7_GPU_ProcessExporter): + lib_suffix = "cuda" + +class MG7_HIP_ProcessExporter(MG7_GPU_ProcessExporter): + lib_suffix = "hip" + 
#------------------------------------------------------------------------------------ class PLUGIN_ProcessExporter_MadEvent(PLUGIN_ProcessExporter): diff --git a/madgraph/iolibs/export_cpp.py b/madgraph/iolibs/export_cpp.py index 4e35316fdc..f8c381db1e 100755 --- a/madgraph/iolibs/export_cpp.py +++ b/madgraph/iolibs/export_cpp.py @@ -3179,17 +3179,11 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) output_options = args[1]["output_options"] simd_opt = output_options.get("simd") - cuda_opt = output_options.get("cuda") - hip_opt = output_options.get("hip") + gpu_opt = output_options.get("gpu") if simd_opt is not None: - self.matrix_element_path = os.path.abspath(simd_opt) - self.matrix_element_gpu = None - elif cuda_opt is not None: - self.matrix_element_path = os.path.abspath(cuda_opt) - self.matrix_element_gpu = "cuda" - elif hip_opt is not None: - self.matrix_element_path = os.path.abspath(hip_opt) - self.matrix_element_gpu = "hip" + self.matrix_element_path = os.path.abspath(os.path.join(self.dir_path, simd_opt)) + elif gpu_opt is not None: + self.matrix_element_path = os.path.abspath(os.path.join(self.dir_path, gpu_opt)) else: self.matrix_element_path = None self.process_info = [] @@ -3197,31 +3191,32 @@ def __init__(self, *args, **kwargs): def generate_subprocess_directory( self, matrix_element, cpp_helas_call_writer, proc_number=None ): - if self.matrix_element_path is not None: - process_exporter_cpp = self.oneprocessclass(matrix_element,cpp_helas_call_writer) - proc_dir_name = "P%d_%s" % (process_exporter_cpp.process_number, - process_exporter_cpp.process_name) - dirpath = pjoin(self.dir_path, 'SubProcesses', proc_dir_name) - os.mkdir(dirpath) - - suffix = self.matrix_element_gpu or "cpp" - logger.info('Creating files in directory %s' % dirpath) - common_lib_name = f"libmg5amc_common_{suffix}.so" - subproc_lib_name = f"libmg5amc_{process_exporter_cpp.process_name}_{suffix}.so" - os.symlink( - os.path.join(self.matrix_element_path, 
"lib", subproc_lib_name), - os.path.join(dirpath, "api.so") - ) - os.symlink( - os.path.join(self.matrix_element_path, "lib", common_lib_name), - os.path.join(dirpath, common_lib_name) - ) - - else: - proc_dir_name = super().generate_subprocess_directory( - matrix_element, cpp_helas_call_writer, proc_number=None - ) - self.process_info.append(get_subprocess_info(matrix_element, proc_dir_name)) + # if self.matrix_element_path is not None: + # process_exporter_cpp = self.oneprocessclass(matrix_element,cpp_helas_call_writer) + # proc_dir_name = "P%d_%s" % (process_exporter_cpp.process_number, + # process_exporter_cpp.process_name) + # dirpath = pjoin(self.dir_path, 'SubProcesses', proc_dir_name) + # os.mkdir(dirpath) + # + # suffix = self.matrix_element_gpu or "cpp" + # logger.info('Creating files in directory %s' % dirpath) + # common_lib_name = f"libmg5amc_common_{suffix}.so" + # subproc_lib_name = f"libmg5amc_{process_exporter_cpp.process_name}_{suffix}.so" + # os.symlink( + # os.path.join(self.matrix_element_path, "lib", subproc_lib_name), + # os.path.join(dirpath, "api.so") + # ) + # os.symlink( + # os.path.join(self.matrix_element_path, "lib", common_lib_name), + # os.path.join(dirpath, common_lib_name) + # ) + + proc_dir_name = super().generate_subprocess_directory( + matrix_element, cpp_helas_call_writer, proc_number=None + ) + process_exporter_cpp = self.oneprocessclass(matrix_element,cpp_helas_call_writer) + me_lib_path = self.matrix_element_path.format(processid_short = f"mg5amc_{process_exporter_cpp.process_name}") + self.process_info.append(get_subprocess_info(matrix_element, proc_dir_name, me_lib_path)) def copy_template_simd(self, model): try: @@ -3244,7 +3239,7 @@ def copy_template_simd(self, model): # Copy the needed src files from_template = { - **self.from_template, "SubProcesses": [] + "SubProcesses": [], **self.from_template } for key, files in from_template.items(): for f in files: diff --git a/madgraph/iolibs/export_mg7.py 
b/madgraph/iolibs/export_mg7.py index 0da9060cb8..060a1df0b7 100644 --- a/madgraph/iolibs/export_mg7.py +++ b/madgraph/iolibs/export_mg7.py @@ -5,7 +5,7 @@ from madgraph.various.diagram_symmetry import find_symmetry, IdentifySGConfigTag -def get_subprocess_info(matrix_element, proc_dir_name): +def get_subprocess_info(matrix_element, proc_dir_name, lib_me_path): model = matrix_element.get("processes")[0].get("model") amplitude = matrix_element.get("base_amplitude") @@ -151,7 +151,7 @@ def get_subprocess_info(matrix_element, proc_dir_name): "incoming": incoming, "outgoing": outgoing, "channels": channels, - "path": os.path.join("SubProcesses", proc_dir_name, "api.so"), + "path": lib_me_path, "flavors": flavors, "color_flows": color_flows, "pdg_color_types": pdg_color_types, From 127968fefcf868d8349bfd9a6528e287b8e7955f Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 09:47:10 +0100 Subject: [PATCH 03/33] Update UMAMI with flavor indices --- .../iolibs/template_files/gpu/umami.cc | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index d83768a43d..031e6ec89d 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -19,6 +19,7 @@ namespace void* initialize_impl( const fptype* momenta, const fptype* couplings, + const unsigned int* flavor_indices, fptype* matrix_elements, #ifdef MGONGPUCPP_GPUIMPL fptype* color_jamps, @@ -29,7 +30,7 @@ namespace { bool is_good_hel[CPPProcess::ncomb]; sigmaKin_getGoodHel( - momenta, couplings, matrix_elements, numerators, denominators, + momenta, couplings, flavor_indices, matrix_elements, numerators, denominators, #ifdef MGONGPUCPP_GPUIMPL color_jamps, #endif @@ -42,6 +43,7 @@ namespace void initialize( const fptype* momenta, const fptype* couplings, + 
const unsigned int* flavor_indices, fptype* matrix_elements, #ifdef MGONGPUCPP_GPUIMPL fptype* color_jamps, @@ -51,7 +53,7 @@ namespace std::size_t count ) { // static local initialization is called exactly once in a thread-safe way - static void* dummy = initialize_impl( momenta, couplings, matrix_elements, + static void* dummy = initialize_impl( momenta, couplings, flavor_indices, matrix_elements, #ifdef MGONGPUCPP_GPUIMPL color_jamps, #endif @@ -88,11 +90,13 @@ namespace const double* color_random_in, const double* diagram_random_in, const double* alpha_s_in, + const unsigned int* flavor_indices_in, fptype* momenta, fptype* helicity_random, fptype* color_random, fptype* diagram_random, fptype* g_s, + unsigned int* flavor_indices, std::size_t count, std::size_t stride, std::size_t offset ) @@ -105,6 +109,7 @@ namespace helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + flavor_indices[i_event] = flavor_indices_in ? 
flavor_indices_in[i_event + offset] : 0; } __global__ void copy_outputs( @@ -236,7 +241,7 @@ extern "C" { const double* momenta_in = nullptr; const double* alpha_s_in = nullptr; - const int* flavor_in = nullptr; // TODO: unused + const unsigned int* flavor_indices_in = nullptr; const double* random_color_in = nullptr; const double* random_helicity_in = nullptr; const double* random_diagram_in = nullptr; @@ -254,7 +259,7 @@ extern "C" alpha_s_in = static_cast( input ); break; case UMAMI_IN_FLAVOR_INDEX: - flavor_in = static_cast( input ); + flavor_indices_in = static_cast( input ); break; case UMAMI_IN_RANDOM_COLOR: random_color_in = static_cast( input ); @@ -322,12 +327,13 @@ extern "C" fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; int *helicity_index, *color_index; - unsigned int* diagram_index; + unsigned int *flavor_indices, *diagram_index; std::size_t n_coup = mg5amcGpu::Parameters_dependentCouplings::ndcoup; gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &flavor_indices, rounded_count * sizeof( unsigned int ), gpu_stream ); gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); @@ -347,11 +353,13 @@ extern "C" random_color_in, random_diagram_in, alpha_s_in, + flavor_indices_in, momenta, helicity_random, color_random, diagram_random, g_s, + flavor_indices, count, stride, offset ); @@ -365,13 +373,14 @@ extern "C" if( !instance->initialized ) { initialize( - momenta, couplings, matrix_elements, color_jamps, 
numerators, denominators, rounded_count ); + momenta, couplings, flavor_indices, matrix_elements, color_jamps, numerators, denominators, rounded_count ); instance->initialized = true; } sigmaKin( momenta, couplings, + flavor_indices, helicity_random, color_random, nullptr, @@ -411,6 +420,7 @@ extern "C" gpuFreeAsync( momenta, gpu_stream ); gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( flavor_indices, gpu_stream ); gpuFreeAsync( g_s, gpu_stream ); gpuFreeAsync( helicity_random, gpu_stream ); gpuFreeAsync( color_random, gpu_stream ); @@ -432,6 +442,7 @@ extern "C" HostBufferBase momenta( rounded_count * CPPProcess::npar * 4 ); HostBufferBase couplings( rounded_count * mg5amcCpu::Parameters_dependentCouplings::ndcoup * 2 ); HostBufferBase g_s( rounded_count ); + HostBufferBase flavor_indices( rounded_count ); HostBufferBase helicity_random( rounded_count ); HostBufferBase color_random( rounded_count ); HostBufferBase diagram_random( rounded_count ); @@ -448,6 +459,7 @@ extern "C" color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + flavor_indices[i_event] = flavor_indices_in ? 
flavor_indices_in[i_event + offset] : 0; } computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); @@ -457,6 +469,7 @@ extern "C" initialize( momenta.data(), couplings.data(), + flavor_indices.data(), matrix_elements.data(), numerators.data(), denominators.data(), @@ -467,6 +480,7 @@ extern "C" sigmaKin( momenta.data(), couplings.data(), + flavor_indices.data(), helicity_random.data(), color_random.data(), nullptr, From 7c2f5ce01a716526f58afe6b6d8104923280596a Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 16:50:33 +0100 Subject: [PATCH 04/33] Remove multichannel ifdefs, ok to cherry-pick --- .../iolibs/template_files/gpu/MadgraphTest.h | 2 -- .../gpu/MatrixElementKernels.cc | 26 ---------------- .../template_files/gpu/MatrixElementKernels.h | 4 --- .../gpu/MemoryAccessDenominators.h | 2 -- .../gpu/MemoryAccessNumerators.h | 2 -- .../iolibs/template_files/gpu/MemoryBuffers.h | 4 --- .../iolibs/template_files/gpu/coloramps.h | 5 ---- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 13 -------- .../iolibs/template_files/gpu/process_cc.inc | 2 -- .../gpu/process_function_definitions.inc | 30 ------------------- .../iolibs/template_files/gpu/process_h.inc | 12 -------- .../template_files/gpu/process_matrix.inc | 2 -- .../gpu/process_sigmaKin_function.inc | 24 --------------- .../iolibs/template_files/gpu/runTest.cc | 18 ----------- PLUGIN/CUDACPP_OUTPUT/model_handling.py | 22 -------------- 15 files changed, 168 deletions(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index a278f8849b..62b8e264af 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -77,9 +77,7 @@ namespace referenceData[batchNo].ChanIds.resize( evtNo + 1 ); std::string dummy; lineStr >> dummy >> referenceData[batchNo].ChanIds[evtNo]; -#ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL referenceData[batchNo].ChanIds[evtNo] = 0; // disable ChanId comparison if multichannel is not supported #976 -#endif } else if( line.find( "SelHel" ) != std::string::npos ) { diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index c4ba05cb42..3c6b9de905 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -164,10 +164,8 @@ namespace mg5amcCpu : MatrixElementKernelBase( momenta, gs, iflavorVec, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( nevt ) , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) -#endif { //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); @@ -207,11 +205,7 @@ namespace mg5amcCpu HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ); -#else - sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); -#endif // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] 
return sigmaKin_setGoodHel( hstIsGoodHel.data() ); @@ -222,13 +216,8 @@ namespace mg5amcCpu void MatrixElementKernelHost::computeMatrixElements( const bool useChannelIds ) { computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); -#else - assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); -#endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); @@ -321,11 +310,9 @@ namespace mg5amcGpu , m_couplings( this->nevt() ) , m_pHelMEs() , m_pHelJamps() -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL , m_pHelNumerators() , m_pHelDenominators() , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) -#endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif @@ -361,11 +348,9 @@ namespace mg5amcGpu } // Create the "one-helicity" jamp buffer that will be used for helicity filtering m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); -#endif // Decide at runtime whether to use BLAS for color sums // Decide at runtime 
whether TF32TENSOR math should be used in cuBLAS static bool first = true; @@ -453,11 +438,7 @@ namespace mg5amcGpu // ... 0d1. Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); const int nevt = m_gpublocks * m_gputhreads; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelNumerators->data(), m_pHelDenominators->data(), hstIsGoodHel.data(), nevt ); -#else - sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_matrixElements.data(), m_pHelJamps->data(), hstIsGoodHel.data(), nevt ); -#endif // ... 0d3. Set good helicity list in host static memory int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity @@ -480,12 +461,10 @@ namespace mg5amcGpu // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); -#endif #ifndef MGONGPU_HAS_NO_BLAS // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -512,13 +491,8 @@ namespace mg5amcGpu fptype2* ghelAllBlasTmp = nullptr; gpuBlasHandle_t* pBlasHandle = nullptr; #endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); -#else - assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_iflavorVec.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); -#endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?! 
diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 693eeff489..c47f59aa57 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -149,13 +149,11 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD HostBufferCouplings m_couplings; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // The buffer for the event-by-event numerators of multichannel factors HostBufferNumerators m_numerators; // The buffer for the event-by-event denominators of multichannel factors HostBufferDenominators m_denominators; -#endif }; #endif @@ -206,7 +204,6 @@ namespace mg5amcCpu // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) std::unique_ptr m_pHelJamps; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) std::unique_ptr m_pHelNumerators; @@ -215,7 +212,6 @@ namespace mg5amcCpu // The super-buffer of ncolor jamp2 buffers DeviceBufferSimple m_colJamp2s; -#endif #ifdef MGONGPU_CHANNELID_DEBUG // The **host** buffer for the channelId array diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h index 32f9be652d..19c241598e 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h @@ -5,7 +5,6 @@ #ifndef MemoryAccessDenominators_H #define MemoryAccessDenominators_H 1 -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessGs.h" @@ -28,5 +27,4 @@ namespace mg5amcCpu } // end namespace 
mg5amcGpu/mg5amcCpu -#endif #endif // MemoryAccessDenominators_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h index 298007e9b9..b9543736e8 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h @@ -5,7 +5,6 @@ #ifndef MemoryAccessNumerators_H #define MemoryAccessNumerators_H 1 -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessGs.h" @@ -28,5 +27,4 @@ namespace mg5amcCpu } // end namespace mg5amcGpu/mg5amcCpu -#endif #endif // MemoryAccessNumerators_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index 8b45069832..275e25faae 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -291,7 +291,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // A base class encapsulating a memory buffer for numerators (of the multichannel single-diagram enhancement factors) typedef BufferBase BufferNumerators; @@ -307,12 +306,10 @@ namespace mg5amcCpu typedef PinnedHostBuffer PinnedHostBufferNumerators; // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; -#endif #endif //-------------------------------------------------------------------------- -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // A base class encapsulating a memory buffer for denominators (of the multichannel single-diagram enhancement factors) typedef BufferBase BufferDenominators; @@ -327,7 +324,6 @@ namespace mg5amcCpu typedef PinnedHostBuffer PinnedHostBufferDenominators; // A class 
encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; -#endif #endif //-------------------------------------------------------------------------- diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/coloramps.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/coloramps.h index 342fc698c2..6967530cba 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/coloramps.h +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/coloramps.h @@ -8,10 +8,6 @@ #include "CPPProcess.h" -// Note: strictly speaking the check '#ifdef MGONGPU_SUPPORTS_MULTICHANNEL' is not needed here, -// because coloramps.h is not included otherwise, but adding it does not harm and makes the code clearer - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ namespace mgOnGpu { @@ -66,6 +62,5 @@ namespace mgOnGpu }; } -#endif /* clang-format on */ #endif // COLORAMPS_H diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index c32d0a2740..e830dbacbf 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,10 +6,6 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -// HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) -// There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) -%(mgongpu_supports_multichannel)s - // Is this a GPU (CUDA, HIP) or CPU implementation? 
#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #define MGONGPUCPP_GPUIMPL cuda @@ -130,15 +126,6 @@ #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ -// Choose whether to enable or disable channelid debug printouts -#ifndef MGONGPU_SUPPORTS_MULTICHANNEL -#undef MGONGPU_CHANNELID_DEBUG // multichannel is not enabled -#else -// By default, do not hardcode, but allow this macro to be set from outside with e.g. -DMGONGPU_CHANNELID_DEBUG -//#undef MGONGPU_CHANNELID_DEBUG // default -////#define MGONGPU_CHANNELID_DEBUG 1 -#endif - // SANITY CHECKS (floating point precision for everything but color algebra #537) #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE_FLOAT #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_FPTYPE_DOUBLE or defined MGONGPU_FPTYPE_FLOAT diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index a52e12fc4c..574cf48b56 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -28,11 +28,9 @@ #include "color_sum.h" #include "processConfig.h" -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" #include "MemoryAccessNumerators.h" #include "coloramps.h" -#endif #include #include diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 95f7269b2c..e4a896138f 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -16,7 +16,6 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL __device__ INLINE unsigned int getChannelId( const 
unsigned int* allChannelIds #ifndef MGONGPUCPP_GPUIMPL , @@ -77,7 +76,6 @@ namespace mg5amcCpu #endif // MGONGPUCPP_GPUIMPL return channelId; } -#endif // MGONGPU_SUPPORTS_MULTICHANNEL constexpr int np4 = CPPProcess::np4; // dimensions of 4-momenta (E, px, py, pz) constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) @@ -201,7 +199,6 @@ namespace mg5amcCpu gpu_channelId( const unsigned int* allChannelIds ) { unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events // SCALAR channelId for the current event (CUDA) if( allChannelIds != nullptr ) @@ -212,7 +209,6 @@ namespace mg5amcCpu channelId = channelIds_sv; assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) } -#endif return channelId; } #endif @@ -446,10 +442,8 @@ namespace mg5amcCpu const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const unsigned int* iflavorVec, // input: indices of the flavor combinations fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif fptype_sv* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) bool* isGoodHel, // output: isGoodHel[ncomb] - host array const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -467,12 +461,8 @@ namespace mg5amcCpu // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); 
// NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); -#else - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, iflavorVec, allJamps, gpublocks * gputhreads ); -#endif gpuLaunchKernel( color_sum_kernel, gpublocks, gputhreads, allMEs, allJamps, nOneHel ); gpuMemcpy( hstMEs, allMEs, maxtry * sizeof( fptype ), gpuMemcpyDeviceToHost ); //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << std::endl; @@ -493,10 +483,8 @@ namespace mg5amcCpu const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const unsigned int* iflavorVec, // input: indices of the flavor combinations fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { @@ -545,11 +533,7 @@ namespace mg5amcCpu #else cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ calculate_jamps( ihel, allmomenta, allcouplings, iflavorVec, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? -#else - calculate_jamps( ihel, allmomenta, allcouplings, iflavorVec, jamp_sv, ievt00 ); //maxtry? 
-#endif /* clang-format on */ color_sum_cpu( allMEs, jamp_sv, ievt00 ); for( int ieppV = 0; ieppV < neppV; ++ieppV ) { @@ -632,18 +616,15 @@ namespace mg5amcCpu __global__ void normalise_output( fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities const unsigned int* iflavorVec, -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) bool storeChannelWeights, // if true, compute final multichannel weights bool mulChannelWeight, // if true, multiply matrix element by channel weight -#endif const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= (globaldenom * broken_symmetry_factor(iflavorVec[ievt])); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { @@ -666,7 +647,6 @@ namespace mg5amcCpu allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } } -#endif return; } #endif @@ -707,7 +687,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void select_col_and_diag( int* allselcol, // output: color selection[nevt] unsigned int* allDiagramIdsOut, // output: sampled diagram ids @@ -801,7 +780,6 @@ namespace mg5amcCpu } return; } -#endif #endif //-------------------------------------------------------------------------- @@ -812,22 +790,18 @@ namespace mg5amcCpu const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] const unsigned int* iflavorVec, // input: indices of the flavor combinations const fptype* allrndhel, // input: random numbers[nevt] for helicity selection -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling -#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) bool mulChannelWeight, // if true, multiply channel weight to ME output -#endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) @@ -836,13 +810,11 @@ namespace mg5amcCpu const int gpublocks, // input: cuda gpublocks const int gputhreads // input: cuda gputhreads #else -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* 
allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) bool mulChannelWeight, // if true, multiply channel weight to ME output -#endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -866,10 +838,8 @@ namespace mg5amcCpu //assert( (size_t)(allmomenta) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events -#endif #endif // Start sigmaKin_lines diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index d9a6584097..0aeef2e758 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -52,10 +52,8 @@ namespace mg5amcCpu const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const unsigned int* iflavorVec, // input: index of the flavor combination fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif fptype_sv* allJamps, // output: jamp[ncolor*2*nevt] bool* isGoodHel, // output: isGoodHel[ncomb] - device 
array (GPU device implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -65,10 +63,8 @@ namespace mg5amcCpu const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const unsigned int* iflavorVec, // input: index of the flavor combination fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities -#endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ @@ -86,21 +82,17 @@ namespace mg5amcCpu const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const unsigned int* iflavorVec, // input: index of the flavor combination const fptype* allrndhel, // input: random numbers[nevt] for helicity selection -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling -#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) 
unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) bool mulChannelWeight, // if true, multiply channel weight to ME output -#endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities @@ -114,20 +106,16 @@ namespace mg5amcCpu const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const unsigned int* iflavorVec, // input: index of the flavor combination const fptype* allrndhel, // input: random numbers[nevt] for helicity selection -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling -#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) bool mulChannelWeight, // if true, multiply channel weight to ME output -#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index aac7506855..e45608be31 100644 
--- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -10,7 +10,6 @@ // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifndef MGONGPUCPP_GPUIMPL if( jamp2_sv ) // disable color choice if nullptr { @@ -27,7 +26,6 @@ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( jamp_sv[icol] ) ); } #endif /* clang-format on */ -#endif // *** PREPARE OUTPUT JAMPS *** #ifdef MGONGPUCPP_GPUIMPL diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 290efed541..3e37994a8c 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -15,11 +15,9 @@ const int nevt = gpublocks * gputhreads; gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); -#endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else // *** PART 0b - C++ *** @@ -30,7 +28,6 @@ fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); fptype_sv* numerators_sv = 
NUM_ACCESS::kernelAccessP( numerators ); @@ -40,7 +37,6 @@ numerators_sv[i] = fptype_sv{ 0 }; } denominators_sv = fptype_sv{ 0 }; -#endif } #endif @@ -58,14 +54,10 @@ { const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); -#else - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, iflavorVec, hAllJamps, nevt ); -#endif } // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); @@ -74,15 +66,11 @@ // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, iflavorVec, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); // Event-by-event random choice of color and diagram #402 gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, 
colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, iflavorVec, helcolDenominators[0] ); -#endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** #else // CUDA OR C++ @@ -104,11 +92,7 @@ // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside #define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #define _OMPLIST1 , allDenominators, allNumerators, allChannelIds, mgOnGpu::icolamp, mgOnGpu::channel2iconfig -#else -#define _OMPLIST1 -#endif #pragma omp parallel for default( none ) shared( _OMPLIST0 _OMPLIST1 ) #undef _OMPLIST0 #undef _OMPLIST1 @@ -131,13 +115,9 @@ { const int ihel = cGoodHel[ighel]; cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; calculate_jamps( ihel, allmomenta, allcouplings, iflavorVec, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); -#else - calculate_jamps( ihel, allmomenta, allcouplings, iflavorVec, jamp_sv, ievt00 ); -#endif color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -182,7 +162,6 @@ } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT const int vecsize = 2 * neppV; #else @@ -292,7 +271,6 @@ #endif } } -#endif // multichannel enabled (random color choice) } // *** END OF PART 1b - C++ (loop on event pages) @@ -309,7 +287,6 @@ fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= (helcolDenominators[0] * broken_symmetry_factor(iflavorVec[ievt0])); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); @@ -319,7 +296,6 @@ fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } -#endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) //{ // const unsigned int ievt = ipagV * neppV + ieppV; diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 678eb8c34e..b14e2b7b71 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -17,9 +17,7 @@ #include "MemoryBuffers.h" #include "RamboSamplingKernels.h" 
#include "RandomNumberKernels.h" -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "coloramps.h" -#endif #include "epoch_process_id.h" #include @@ -43,7 +41,6 @@ struct CUDA_CPU_TestBase : public TestDriverBase virtual bool useChannelIds() const = 0; // Set channelId array (in the same way for CUDA and CPU tests) static constexpr unsigned int warpSize = 32; // FIXME: add a sanity check in madevent that this is the minimum? (would need to expose this from cudacpp to madevent) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL static void setChannelIds( BufferChannelIds& hstChannelIds, std::size_t iiter ) { static const char* debugC = getenv( "CUDACPP_RUNTEST_DEBUG" ); @@ -78,13 +75,6 @@ struct CUDA_CPU_TestBase : public TestDriverBase hstChannelIds[iWarp * warpSize + i] = channelId; } } -#else - static void setChannelIds( BufferChannelIds& hstChannelIds, std::size_t /*iiter*/ ) - { - // No-multichannel tests (set a DUMMY channelId=0 for all events: this is not used for ME comparison, but it does enter the comparison to reference results #976) - for( unsigned int i = 0; i < nevt; ++i ) hstChannelIds[i] = 0; - } -#endif }; #ifndef MGONGPUCPP_GPUIMPL @@ -406,25 +396,19 @@ struct CUDATestMultiChannel : public CUDATest // Note: instantiate test2 first and test1 second to ensure that the channelid printout from the dtors comes from test1 first and test2 second #ifdef MGONGPUCPP_GPUIMPL // CUDA test drivers -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL CUDATestMultiChannel driver2( MG_EPOCH_REFERENCE_FILE_NAME ); #define TESTID2( s ) s##_GPU_MULTICHANNEL -#endif CUDATestNoMultiChannel driver1( MG_EPOCH_REFERENCE_FILE_NAME ); #define TESTID1( s ) s##_GPU_NOMULTICHANNEL #else // CPU test drivers -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL CPUTestMultiChannel driver2( MG_EPOCH_REFERENCE_FILE_NAME ); #define TESTID2( s ) s##_CPU_MULTICHANNEL -#endif CPUTestNoMultiChannel driver1( MG_EPOCH_REFERENCE_FILE_NAME ); #define TESTID1( s ) s##_CPU_NOMULTICHANNEL #endif // Madgraph tests -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL MadgraphTest mgTest2( driver2 ); -#endif MadgraphTest mgTest1( driver1 ); // Instantiate Google test 1 #define XTESTID1( s ) TESTID1( s ) @@ -436,7 +420,6 @@ TEST( XTESTID1( MG_EPOCH_PROCESS_ID ), compareMomAndME ) mgTest1.CompareMomentaAndME( *this ); } // Instantiate Google test 2 -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #define XTESTID2( s ) TESTID2( s ) TEST( XTESTID2( MG_EPOCH_PROCESS_ID ), compareMomAndME ) { @@ -445,5 +428,4 @@ TEST( XTESTID2( MG_EPOCH_PROCESS_ID ), compareMomAndME ) #endif mgTest2.CompareMomentaAndME( *this ); } -#endif /* clang-format on */ diff --git a/PLUGIN/CUDACPP_OUTPUT/model_handling.py b/PLUGIN/CUDACPP_OUTPUT/model_handling.py index b1bc718d62..52211f504d 100644 --- a/PLUGIN/CUDACPP_OUTPUT/model_handling.py +++ b/PLUGIN/CUDACPP_OUTPUT/model_handling.py @@ -1603,21 +1603,17 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): const unsigned int* iflavorVec, // input: indices of the flavor combinations #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) -#endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: 
jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#endif const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) @@ -1631,10 +1627,8 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event using F_ACCESS = DeviceAccessIflavorVec; // non-trivial access: buffer includes all events -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events -#endif #else using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events @@ -1643,10 +1637,8 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event using F_ACCESS = HostAccessIflavorVec; // non-trivial access: buffer includes all events -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events -#endif #endif mgDebug( 0, __FUNCTION__ ); //bool debug = true; @@ -1798,10 +1790,6 @@ def edit_mgonGPU(self): replace_dict['nbhel'] = self.matrix_elements[0].get_helicity_combinations() # number of helicity combinations ###replace_dict['nwavefunc'] = self.matrix_elements[0].get_number_of_wavefunctions() # this is the correct P1-specific nwf, now in CPPProcess.h (#644) replace_dict['wavefuncsize'] = 6 - if 
self.include_multi_channel: - replace_dict['mgongpu_supports_multichannel'] = '#define MGONGPU_SUPPORTS_MULTICHANNEL 1' - else: - replace_dict['mgongpu_supports_multichannel'] = '#undef MGONGPU_SUPPORTS_MULTICHANNEL' ff = open(pjoin(self.path, '..','..','src','mgOnGpuConfig.h'),'w') ff.write(template % replace_dict) ff.close() @@ -2251,11 +2239,9 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; -#endif #else // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); @@ -2265,10 +2251,8 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif #endif // Create an array of views over the Flavor Couplings FLV_COUPLING_VIEW flvCOUPs[nIPF]; @@ -2278,11 +2262,9 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi // Reset color flows (reset jamp_sv) at the beginning of a new event or event page for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Numerators and 
denominators for the current event (CUDA) or SIMD event page (C++) fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif // Scalar iflavor for the current event // for GPU it is an int // for SIMD it is also an int, since it is constant across the SIMD vector @@ -2321,18 +2303,14 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi if id_amp in diag_to_config: ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diag_to_config[id_amp]) # BUG #472 ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472 - res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") diagnum = diagram.get('number') res.append("if( storeChannelWeights )") res.append("{") res.append(" numerators_sv[%i] += cxabs2( amp_sv[0] );" % (diagnum-1)) res.append(" denominators_sv += cxabs2( amp_sv[0] );") res.append("}") - res.append("#endif") else: - res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") res.append("// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)") - res.append("#endif") for njamp, coeff in color[namp].items(): scoeff = PLUGIN_OneProcessExporter.coeff(*coeff) # AV if scoeff[0] == '+' : scoeff = scoeff[1:] From 9f9af1e454e8682d2a3d46fe8ee6dabc271a7c43 Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 16:50:57 +0100 Subject: [PATCH 05/33] Fix order of args in umami - ok to cherry-pick --- .../CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index 031e6ec89d..d482c04b61 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ 
b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -30,10 +30,11 @@ namespace { bool is_good_hel[CPPProcess::ncomb]; sigmaKin_getGoodHel( - momenta, couplings, flavor_indices, matrix_elements, numerators, denominators, + momenta, couplings, flavor_indices, matrix_elements, #ifdef MGONGPUCPP_GPUIMPL color_jamps, #endif + numerators, denominators, is_good_hel, count ); sigmaKin_setGoodHel( is_good_hel ); From e626a45d5a26eadff9b7b830d2a5f61a2aaabec0 Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 16:51:49 +0100 Subject: [PATCH 06/33] Set compilation command for MadSpace - cherry pick with caution --- madgraph/iolibs/template_files/mg7/madevent.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/madgraph/iolibs/template_files/mg7/madevent.py b/madgraph/iolibs/template_files/mg7/madevent.py index a02c53af81..aeed03c474 100644 --- a/madgraph/iolibs/template_files/mg7/madevent.py +++ b/madgraph/iolibs/template_files/mg7/madevent.py @@ -472,13 +472,15 @@ def __init__(self, process: MadgraphProcess, meta: dict, subproc_id: int): self.subproc_id = subproc_id self.multi_channel_data = None - api_path = self.meta["path"] + api_path = self.meta["me_path"] + subproc_path = self.meta["path"] if not os.path.isfile(api_path): cwd = os.getcwd() - api_dir = os.path.dirname(api_path) - logger.info(f"Compiling subprocess {api_dir}") - os.chdir(api_dir) - subprocess.run(["make"]) + subproc_dir = os.path.dirname(subproc_path) + logger.info(f"Compiling subprocess {subproc_dir}") + os.chdir(subproc_path) + backend = self.process.run_card.get("device", "cppnone") + subprocess.run(["make", "-j", f"BACKEND={backend}"]) os.chdir(cwd) self.incoming_masses = [ From a40db8444b6a9bce5e9db55ffb10a5e6191dfc0f Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 16:52:23 +0100 Subject: [PATCH 07/33] Use CUDACPP backends in the run card - cherry pick with caution --- 
madgraph/iolibs/template_files/mg7/run_card.toml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/madgraph/iolibs/template_files/mg7/run_card.toml b/madgraph/iolibs/template_files/mg7/run_card.toml index 0af6eab2bf..fb7023e2df 100644 --- a/madgraph/iolibs/template_files/mg7/run_card.toml +++ b/madgraph/iolibs/template_files/mg7/run_card.toml @@ -1,10 +1,6 @@ [run] run_name = "run" -device = "cpu" # options: cpu, cuda -# options: -# -1 to choose automatically -# on x86: 1, 4, 8 -# on Apple silicon: 1, 2 +device = "cpu" # cuda, hip, cpp, cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto simd_vector_size = -1 thread_pool_size = -1 # -1 sets count automatically based on number of CPUs output_format = "compact_npy" # options: compact_npy, lhe_npy, lhe From daa99389ab34a0819cb46e42209cbba89aeaff65 Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 16:52:54 +0100 Subject: [PATCH 08/33] Update madgraph interface to run generate events when using mg7 - ok to cherry pick but seems to not work --- .../madgraph/iolibs/template_files/mg7/run_card.toml | 6 +----- madgraph/interface/madgraph_interface.py | 6 +++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml index 0af6eab2bf..fb7023e2df 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml @@ -1,10 +1,6 @@ [run] run_name = "run" -device = "cpu" # options: cpu, cuda -# options: -# -1 to choose automatically -# on x86: 1, 4, 8 -# on Apple silicon: 1, 2 +device = "cpu" # cuda, hip, cpp, cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto simd_vector_size = -1 thread_pool_size = -1 # -1 sets count automatically based on number of CPUs output_format = "compact_npy" # options: compact_npy, lhe_npy, lhe diff --git 
a/madgraph/interface/madgraph_interface.py b/madgraph/interface/madgraph_interface.py index a516978336..1f6dbd6171 100755 --- a/madgraph/interface/madgraph_interface.py +++ b/madgraph/interface/madgraph_interface.py @@ -1451,10 +1451,10 @@ def find_output_type(self, path): return 'pythia8' elif not os.path.isdir(os.path.join(path, 'SubProcesses')): raise self.InvalidCmd('%s : Not a valid directory' % path) - if os.path.isfile(pjoin(bin_path,'madevent')): - return 'madevent' - elif os.path.isfile(pjoin(card_path, 'run_card.toml')): + if os.path.isfile(pjoin(card_path, 'run_card.toml')): return 'mg7' + elif os.path.isfile(pjoin(bin_path,'madevent')): + return 'madevent' elif os.path.isdir(src_path): return 'standalone_cpp' elif os.path.isdir(mw_path): From ab9c5a2501f61de1328028d8e0c61bd843152790 Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 16:55:41 +0100 Subject: [PATCH 09/33] Modified output --- PLUGIN/CUDACPP_OUTPUT/model_handling.py | 2 +- PLUGIN/CUDACPP_OUTPUT/output.py | 66 +++++++++++++++---------- madgraph/iolibs/export_cpp.py | 12 +++-- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/model_handling.py b/PLUGIN/CUDACPP_OUTPUT/model_handling.py index 52211f504d..de6dc74631 100644 --- a/PLUGIN/CUDACPP_OUTPUT/model_handling.py +++ b/PLUGIN/CUDACPP_OUTPUT/model_handling.py @@ -1737,7 +1737,7 @@ def generate_process_files(self): # NB: this relies on the assumption that cudacpp code is generated before madevent code files.ln(pjoin(self.path, 'cudacpp.mk'), self.path, 'makefile') # Add link to makefile_original.mk, PR #1052 - files.ln(pjoin(self.path, '..', 'makefile_original.mk'), self.path, 'makefile_original.mk') + #files.ln(pjoin(self.path, '..', 'makefile_original.mk'), self.path, 'makefile_original.mk') # # Add symbolic links in the test directory # files.ln(pjoin(self.path + '/../../test', 'cudacpp_test.mk'), self.path + '/../../test', 'makefile') # # Add reference file in the test directory (if 
it exists for this process) diff --git a/PLUGIN/CUDACPP_OUTPUT/output.py b/PLUGIN/CUDACPP_OUTPUT/output.py index b2c669b1c1..74c8c9ec04 100644 --- a/PLUGIN/CUDACPP_OUTPUT/output.py +++ b/PLUGIN/CUDACPP_OUTPUT/output.py @@ -123,7 +123,8 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterMG7): s+'gpu/cudacpp_overlay.mk', s+'gpu/makefile_wrapper.mk', s+'gpu/umami.h', s+'gpu/umami.cc', s+'CMake/SubProcesses/CMakeLists.txt'], - 'Cards': [s+'mg7/run_card.toml'] } + 'Cards': [s+'mg7/run_card.toml'], + 'test': [s+'gpu/cudacpp_test.mk']} to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', @@ -176,10 +177,22 @@ def __init__(self, *args, **kwargs): return super().__init__(*args, **kwargs) # AV - overload the default version: create CMake directory, do not create lib directory - def copy_template_simd(self, model): + def copy_template(self, model): misc.sprint('Entering PLUGIN_ProcessExporter.copy_template (initialise the directory)') - super().copy_template_simd(model) + super().copy_template(model) + try: os.mkdir(self.dir_path) + except os.error as error: logger.warning(error.strerror + ' ' + self.dir_path) with misc.chdir(self.dir_path): + logger.info('Creating subdirectories in directory %s' % self.dir_path) + for d in ['src', 'Cards', 'SubProcesses', 'CMake', 'test', 'test/ref']: # AV - added CMake, test, test/ref; removed lib + try: os.mkdir(d) + except os.error as error: logger.warning(error.strerror + ' ' + os.path.join(self.dir_path,d)) + # Write param_card + open(os.path.join('Cards','param_card.dat'), 'w').write(model.write_param_card()) + # Copy files in various subdirectories + for key in self.from_template: + for f in self.from_template[key]: + PLUGIN_export_cpp.cp(f, key) # NB this assumes directory key exists... 
# Copy src makefile if self.template_src_make: makefile_src = self.read_template_file(self.template_src_make) % {'model': self.get_model_name(model.get('name'))} @@ -208,7 +221,6 @@ def write_source_makefile(self, writer, model=None, default=None): # AV - add debug printouts (in addition to the default one from OM's tutorial) def generate_subprocess_directory(self, subproc_group, fortran_model, me=None): - # used only for standalone misc.sprint('Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory)') misc.sprint(' type(subproc_group)=%s'%type(subproc_group)) # e.g. madgraph.core.helas_objects.HelasMatrixElement misc.sprint(' type(fortran_model)=%s'%type(fortran_model)) # e.g. madgraph.iolibs.helas_call_writers.GPUFOHelasCallWriter @@ -235,9 +247,9 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): # make -f makefile -f cudacpp_overlay.mk to include the overlay # and instead just use `make`, see #1052 subprocesses_dir = pjoin(self.dir_path, "SubProcesses") - files.cp(pjoin(subprocesses_dir, "makefile"), pjoin(subprocesses_dir, "makefile_original.mk")) + # files.cp(pjoin(subprocesses_dir, "makefile"), pjoin(subprocesses_dir, "makefile_original.mk")) files.rm(pjoin(subprocesses_dir, "makefile")) - files.ln(pjoin(subprocesses_dir, "makefile_wrapper.mk"), subprocesses_dir, 'makefile') + files.ln(pjoin(subprocesses_dir, "cudacpp.mk"), subprocesses_dir, 'makefile') patch_coupl_write = r"""set -euo pipefail # Get last fields from lines starting with WRITE(*,2) @@ -271,8 +283,6 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): self.add_madevent_plugin_fct() # Added by OM # do not call standard finalize since is this is already done... 
#return super().finalize(matrix_element, cmdhistory, MG5options, outputflag) - else: - super().finalize() # AV (default from OM's tutorial) - overload settings and add a debug printout def modify_grouping(self, matrix_element): @@ -299,15 +309,36 @@ def add_madevent_plugin_fct(self): files.cp(pjoin(plugin_path, 'launch_plugin.py'), pjoin(self.dir_path, 'bin', 'internal')) files.ln(pjoin(self.dir_path, 'lib'), pjoin(self.dir_path, 'SubProcesses')) -class MG7_SIMD_ProcessExporter(PLUGIN_ProcessExporter): +#------------------------------------------------------------------------------------ + +class PLUGIN_ProcessExporter_MadEvent(PLUGIN_ProcessExporter): + """ a class to include all tweak related to madevent and not related to standalone. + in practise this class is never called but only the SIMD or GPU related class""" + + s = PLUGINDIR + '/madgraph/iolibs/template_files/' + # add template file/ linking only needed in the madevent mode and not in standalone + from_template = dict(PLUGIN_ProcessExporter.from_template) + from_template['SubProcesses'] = from_template['SubProcesses'] + [s+'gpu/fbridge_common.inc', + s+'gpu/counters.cc', + s+'gpu/ompnumthreads.cc'] + + to_link_in_P = PLUGIN_ProcessExporter.to_link_in_P + ['fbridge_common.inc', 'counters.cc','ompnumthreads.cc'] + +#------------------------------------------------------------------------------------ + +class MG7_SIMD_ProcessExporter(PLUGIN_ProcessExporter_MadEvent): lib_suffix = "cpp" @classmethod def change_output_args(cls, args, cmd): """ """ + cmd._export_format = 'madevent' + cmd._export_plugin = export_v4.ProcessExporterME_MG7 args.append('--hel_recycling=False') + args.append('--me_exporter=standalone_simd') # path relative to the process directory args.append('--simd=lib/libmg5amc_{processid_short}_' + cls.lib_suffix + ".so") + cmd._export_plugin.lib_format = 'lib/libmg5amc_{processid_short}_' + cls.lib_suffix + ".so" if 'vector_size' not in ''.join(args): args.append('--vector_size=16') if 
'nb_wrap' not in ''.join(args): @@ -335,23 +366,6 @@ class MG7_CUDA_ProcessExporter(MG7_GPU_ProcessExporter): class MG7_HIP_ProcessExporter(MG7_GPU_ProcessExporter): lib_suffix = "hip" -#------------------------------------------------------------------------------------ - -class PLUGIN_ProcessExporter_MadEvent(PLUGIN_ProcessExporter): - """ a class to include all tweak related to madevent and not related to standalone. - in practise this class is never called but only the SIMD or GPU related class""" - - s = PLUGINDIR + '/madgraph/iolibs/template_files/' - # add template file/ linking only needed in the madevent mode and not in standalone - from_template = dict(PLUGIN_ProcessExporter.from_template) - from_template['SubProcesses'] = from_template['SubProcesses'] + [s+'gpu/fbridge_common.inc', - s+'gpu/counters.cc', - s+'gpu/ompnumthreads.cc'] - - to_link_in_P = PLUGIN_ProcessExporter.to_link_in_P + ['fbridge_common.inc', 'counters.cc','ompnumthreads.cc'] - -#------------------------------------------------------------------------------------ - class SIMD_ProcessExporter(PLUGIN_ProcessExporter_MadEvent): # Default class for the run_card to use diff --git a/madgraph/iolibs/export_cpp.py b/madgraph/iolibs/export_cpp.py index f8c381db1e..b8d507cf14 100755 --- a/madgraph/iolibs/export_cpp.py +++ b/madgraph/iolibs/export_cpp.py @@ -697,7 +697,7 @@ def __init__(self, matrix_elements, cpp_helas_call_writer, process_string = "", 'diagrams': helas_objects.HelasDiagramList([diagram])}) - self.include_multi_channel = False + self.include_multi_channel = True #=============================================================================== # Global helper methods #=============================================================================== @@ -3246,10 +3246,12 @@ def copy_template_simd(self, model): cp(f, key) def copy_template(self, model): - if self.matrix_element_path is None: - super().copy_template(model) - else: - self.copy_template_simd(model) + 
self.copy_template_simd(model) + # if self.matrix_element_path is None: + # super().copy_template(model) + # else: + # self.copy_template_simd(model) + # super().copy_template(model) # TODO: for now, we import the files from madgraph. eventually, we should copy # the files instead to allow for modification From 6af23708b56d2d53c7bcb605942291952b1fb2fe Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 16:56:39 +0100 Subject: [PATCH 10/33] Very ugly hack (please don't judge) to get channel mappings The idea is that we can't use standalone, or, at least, I was too overwhelmed to understand how to get the channel mappings within standalone. So I resort using madevent, but I needed a hack to write the json file with the paths. This is just a hack, not supposed to go into production. --- PLUGIN/CUDACPP_OUTPUT/output.py | 4 ++++ madgraph/iolibs/export_v4.py | 27 ++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/output.py b/PLUGIN/CUDACPP_OUTPUT/output.py index 74c8c9ec04..0828065d29 100644 --- a/PLUGIN/CUDACPP_OUTPUT/output.py +++ b/PLUGIN/CUDACPP_OUTPUT/output.py @@ -351,9 +351,13 @@ class MG7_GPU_ProcessExporter(PLUGIN_ProcessExporter): @classmethod def change_output_args(cls, args, cmd): """ """ + cmd._export_format = 'madevent' + cmd._export_plugin = export_v4.ProcessExporterME_MG7 args.append('--hel_recycling=False') + args.append('--me_exporter=standalone_cuda') # path relative to the process directory args.append('--gpu=lib/libmg5amc_{processid_short}_' + cls.lib_suffix + ".so") + cmd._export_plugin.lib_format = 'lib/libmg5amc_{processid_short}_' + cls.lib_suffix + ".so" if 'vector_size' not in ''.join(args): args.append('--vector_size=32') if 'nb_wrap' not in ''.join(args): diff --git a/madgraph/iolibs/export_v4.py b/madgraph/iolibs/export_v4.py index 87e8a090c4..5c3c72c428 100755 --- a/madgraph/iolibs/export_v4.py +++ b/madgraph/iolibs/export_v4.py @@ -6414,6 +6414,9 @@ class 
ProcessExporterFortranMEGroup(ProcessExporterFortranME): #=========================================================================== # generate_subprocess_directory #=========================================================================== + def save_subproc(self, *args, **kwargs): + pass + def generate_subprocess_directory(self, subproc_group, fortran_model, group_number, @@ -6538,6 +6541,7 @@ def generate_subprocess_directory(self, subproc_group, process_exporter_cpp.generate_process_files_madevent(proc_id=str(ime+1), config_map=subproc_group.get('diagram_maps')[ime], subproc_number=group_number) + self.save_subproc(matrix_element, process_exporter_cpp) for file in second_exporter.to_link_in_P: ln('../%s' % file) # second_exporter.write_matrix_element_madevent(ime, @@ -10715,4 +10719,25 @@ def write_leshouche_file(self, writer, subproc_group): return True - +from madgraph.iolibs.export_mg7 import get_subprocess_info +import json +class ProcessExporterME_MG7(ProcessExporterFortranMEGroup): + lib_format = "" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.process_info = [] + + def save_subproc(self, matrix_element, process_exporter_cpp): + me_lib_path = self.lib_format.format(processid_short = process_exporter_cpp.process_name) + proc_dir = os.path.relpath(process_exporter_cpp.path, self.dir_path) + self.process_info.append(get_subprocess_info(matrix_element, proc_dir, me_lib_path)) + + def finalize(self, *args, **kwargs): + super().finalize(*args, **kwargs) + + file_name = os.path.normpath(os.path.join( + self.dir_path, "SubProcesses", "subprocesses.json" + )) + with open(file_name, 'w') as f: + json.dump(self.process_info, f) From 1b369b5aeae7b7167102e3862cbdc7ac0d2ad18f Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 16:58:09 +0100 Subject: [PATCH 11/33] Rename library path to me_path and include also subprocess folder path - cherry-pick with caution (Theo's authorization missing) --- 
madgraph/iolibs/export_mg7.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/madgraph/iolibs/export_mg7.py b/madgraph/iolibs/export_mg7.py index 060a1df0b7..0d783e25fa 100644 --- a/madgraph/iolibs/export_mg7.py +++ b/madgraph/iolibs/export_mg7.py @@ -5,7 +5,7 @@ from madgraph.various.diagram_symmetry import find_symmetry, IdentifySGConfigTag -def get_subprocess_info(matrix_element, proc_dir_name, lib_me_path): +def get_subprocess_info(matrix_element, proc_dir, lib_me_path): model = matrix_element.get("processes")[0].get("model") amplitude = matrix_element.get("base_amplitude") @@ -151,7 +151,8 @@ def get_subprocess_info(matrix_element, proc_dir_name, lib_me_path): "incoming": incoming, "outgoing": outgoing, "channels": channels, - "path": lib_me_path, + "me_path": lib_me_path, + "path": proc_dir, "flavors": flavors, "color_flows": color_flows, "pdg_color_types": pdg_color_types, From ce2a577071af3c59766cbb6f5fc990cf30b308fd Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 16:59:57 +0100 Subject: [PATCH 12/33] Use full process name for the library, to interface better with MadSpace - ok to cherry-pick --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 9f1494512c..90369fc206 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -834,7 +834,7 @@ $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../../src/*.h ../../src/*.cc $(BUILDDIR)/.b #------------------------------------------------------------------------------- -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') +processid_short=$(shell basename $(CURDIR) | cut -d_ -f 2-) ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = 
mg5amc_$(processid_short)_cpp From 5fb2cbea250a560929c973dbddbb515d0ff0db26 Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 17:00:29 +0100 Subject: [PATCH 13/33] Just a workaround to not use the broken simd vector size --- .../madgraph/iolibs/template_files/mg7/madevent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py index a02c53af81..87d8c73a21 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py @@ -113,7 +113,7 @@ def load_cards(self) -> None: self.subprocess_data = json.load(f) def init_backend(self) -> None: - ms.set_simd_vector_size(self.run_card["run"]["simd_vector_size"]) + # ms.set_simd_vector_size(self.run_card["run"]["simd_vector_size"]) ms.set_thread_count(self.run_card["run"]["thread_pool_size"]) def init_event_dir(self) -> None: From c32f460395d1715dbb06b68aa3601ae137407531 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Tue, 10 Feb 2026 17:55:32 +0100 Subject: [PATCH 14/33] make launch command work --- madgraph/interface/madgraph_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/madgraph/interface/madgraph_interface.py b/madgraph/interface/madgraph_interface.py index 1f6dbd6171..281d70011c 100755 --- a/madgraph/interface/madgraph_interface.py +++ b/madgraph/interface/madgraph_interface.py @@ -1362,7 +1362,7 @@ def check_launch(self, args, options): if not args: if self._done_export: mode = self.find_output_type(self._done_export[0]) - if (self._done_export[1] == 'plugin' and mode in self._export_formats): + if mode == "mg7" or (self._done_export[1] == 'plugin' and mode in self._export_formats): args.append(mode) args.append(self._done_export[0]) elif self._done_export[1].startswith(mode): @@ -7675,7 +7675,7 @@ def do_launch(self, 
line): ext_program = launch_ext.MWLauncher( self, args[1], shell = isinstance(self, cmd.CmdShell), options=self.options,**options) - elif args[0] == 'mg7': + elif args[0][:3] == 'mg7': class ext_program: @staticmethod def run(): From 411fbb8b1858760989059176b4157381fdeaac7e Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Tue, 10 Feb 2026 23:16:04 +0100 Subject: [PATCH 15/33] flavor index bugfix --- .../madgraph/iolibs/template_files/gpu/umami.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc index d482c04b61..e77e53d2d0 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc @@ -460,7 +460,10 @@ extern "C" color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; - flavor_indices[i_event] = flavor_indices_in ? flavor_indices[i_event + offset] : 0; + flavor_indices[i_event] = flavor_indices_in ? 
flavor_indices_in[i_event + offset] : 0; + } + for ( std::size_t i_event = count; i_event < rounded_count; ++i_event ) { + flavor_indices[i_event] = 0; } computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); From 04c3f00fe0596754e495ed74a4303c5b5b9baceb Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Tue, 10 Feb 2026 17:55:37 +0100 Subject: [PATCH 16/33] Fix issue with double defined variable in GPU case - ok to cherry pick --- .../iolibs/template_files/gpu/process_function_definitions.inc | 2 -- 1 file changed, 2 deletions(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index e4a896138f..2e2b0537e4 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -157,8 +157,6 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; __device__ __constant__ int dcNGoodHel; __device__ __constant__ int dcGoodHel[ncomb]; __device__ __constant__ short cFlavors[nmaxflavor][npar]; From 9d77de4ed983ee724df069b8a4fa08433ae8db0a Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Wed, 11 Feb 2026 10:44:38 +0100 Subject: [PATCH 17/33] Modify FLV coupling view handling, given sometimes the size is zero - ok to cherry pick --- .../aloha/template_files/gpu/helas.h | 40 ++++++++++++++----- PLUGIN/CUDACPP_OUTPUT/model_handling.py | 4 +- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h b/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h index 73621b293a..4e19ae660d 100644 --- 
a/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h +++ b/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h @@ -32,19 +32,37 @@ struct FLV_COUPLING_VIEW { - const int* partner1; - const int* partner2; - const fptype* value; + const int* const partner1; + const int* const partner2; + const fptype* const value; - __host__ __device__ FLV_COUPLING_VIEW() = default; __host__ __device__ - FLV_COUPLING_VIEW(const int* partner1_base, - const int* partner2_base, - const fptype* value_base, - const int n) - : partner1(partner1_base + n), - partner2(partner2_base + n), - value(value_base + 2*n) {} + FLV_COUPLING_VIEW(const int* p1, const int* p2, const fptype* v) + : partner1(p1), partner2(p2), value(v) {} + }; + + template + class FLV_COUPLING_ARRAY { + + static_assert(SIZE >= 0, "flvCOUPs SIZE must be non-negative"); + static_assert(STRIDE > 0, "flvCOUPs STRIDE must be positive"); + const int* const partner1; + const int* const partner2; + const fptype* const value; + + public: + __host__ __device__ + FLV_COUPLING_ARRAY(const int* p1, const int* p2, const fptype* v) + : partner1(p1), partner2(p2), value(v) {} + + __host__ __device__ + FLV_COUPLING_VIEW operator[](const int i) const { + return FLV_COUPLING_VIEW{ + partner1 + i*STRIDE, + partner2 + i*STRIDE, + value + i*2*STRIDE + }; + } }; //-------------------------------------------------------------------------- diff --git a/PLUGIN/CUDACPP_OUTPUT/model_handling.py b/PLUGIN/CUDACPP_OUTPUT/model_handling.py index de6dc74631..27f21de163 100644 --- a/PLUGIN/CUDACPP_OUTPUT/model_handling.py +++ b/PLUGIN/CUDACPP_OUTPUT/model_handling.py @@ -2255,9 +2255,7 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif // Create an array of views over the Flavor Couplings - FLV_COUPLING_VIEW flvCOUPs[nIPF]; - for ( int idflv = 0; idflv < nIPF; idflv++ ) - flvCOUPs[idflv] = FLV_COUPLING_VIEW{ cIPF_partner1, 
cIPF_partner2, cIPF_value, idflv * nMF }; + FLV_COUPLING_ARRAY flvCOUPs{ cIPF_partner1, cIPF_partner2, cIPF_value }; // Reset color flows (reset jamp_sv) at the beginning of a new event or event page for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } From 2f7b9799039beb3f1526474e5fc2343d538a24b9 Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Wed, 11 Feb 2026 10:45:30 +0100 Subject: [PATCH 18/33] Remove warning for host default ALOHAOBJ constructor --- PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h b/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h index 4e19ae660d..dbe04f2e85 100644 --- a/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h +++ b/PLUGIN/CUDACPP_OUTPUT/aloha/template_files/gpu/helas.h @@ -25,7 +25,7 @@ fptype * w; int flv_index; - __host__ __device__ ALOHAOBJ() = default; + __host__ __device__ ALOHAOBJ() {} __host__ __device__ ALOHAOBJ(fptype_sv * pvec_sv, cxtype_sv * w_sv, int flv = -1) : pvec(pvec_sv), w(reinterpret_cast(w_sv)), flv_index(flv) {} }; From a9a98c9ee54711bae8af273789bb0b8286397386 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Wed, 11 Feb 2026 12:03:41 +0100 Subject: [PATCH 19/33] change some run card defaults --- .../madgraph/iolibs/template_files/mg7/run_card.toml | 6 +++--- madgraph/iolibs/template_files/mg7/run_card.toml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml index fb7023e2df..1cb66c081a 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml @@ -3,7 +3,7 @@ run_name = "run" device = "cpu" # cuda, hip, cpp, cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto simd_vector_size = -1 thread_pool_size 
= -1 # -1 sets count automatically based on number of CPUs -output_format = "compact_npy" # options: compact_npy, lhe_npy, lhe +output_format = "lhe" # options: compact_npy, lhe_npy, lhe verbosity = "pretty" # options: silent, pretty, log dummy_matrix_element = false @@ -112,8 +112,8 @@ buffered_steps = 0 uniform_channel_ratio = 0.1 integration_history_length = 1000 max_stored_channel_weights = 100 -channel_dropping_threshold = 0.001 -channel_dropping_interval = 300 +channel_dropping_threshold = 0.01 +channel_dropping_interval = 200 drop_zero_integrands = true batch_size_threshold = 0.5 channel_grouping_mode = "uniform" # options: none, uniform, learned diff --git a/madgraph/iolibs/template_files/mg7/run_card.toml b/madgraph/iolibs/template_files/mg7/run_card.toml index fb7023e2df..1cb66c081a 100644 --- a/madgraph/iolibs/template_files/mg7/run_card.toml +++ b/madgraph/iolibs/template_files/mg7/run_card.toml @@ -3,7 +3,7 @@ run_name = "run" device = "cpu" # cuda, hip, cpp, cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto simd_vector_size = -1 thread_pool_size = -1 # -1 sets count automatically based on number of CPUs -output_format = "compact_npy" # options: compact_npy, lhe_npy, lhe +output_format = "lhe" # options: compact_npy, lhe_npy, lhe verbosity = "pretty" # options: silent, pretty, log dummy_matrix_element = false @@ -112,8 +112,8 @@ buffered_steps = 0 uniform_channel_ratio = 0.1 integration_history_length = 1000 max_stored_channel_weights = 100 -channel_dropping_threshold = 0.001 -channel_dropping_interval = 300 +channel_dropping_threshold = 0.01 +channel_dropping_interval = 200 drop_zero_integrands = true batch_size_threshold = 0.5 channel_grouping_mode = "uniform" # options: none, uniform, learned From 6f07707435d1750457f78c94421aeade23d43f2b Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Wed, 11 Feb 2026 12:06:36 +0100 Subject: [PATCH 20/33] Change default for device (backend) parameter in run card --- 
madgraph/iolibs/template_files/mg7/run_card.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/madgraph/iolibs/template_files/mg7/run_card.toml b/madgraph/iolibs/template_files/mg7/run_card.toml index 1cb66c081a..ebe37f2191 100644 --- a/madgraph/iolibs/template_files/mg7/run_card.toml +++ b/madgraph/iolibs/template_files/mg7/run_card.toml @@ -1,6 +1,6 @@ [run] run_name = "run" -device = "cpu" # cuda, hip, cpp, cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto +device = "cppauto" # cuda, hip, cpp, cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto simd_vector_size = -1 thread_pool_size = -1 # -1 sets count automatically based on number of CPUs output_format = "lhe" # options: compact_npy, lhe_npy, lhe From 955e7eb03a25e8a315e692b53bf76ecbe76ea485 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Wed, 11 Feb 2026 13:59:17 +0100 Subject: [PATCH 21/33] fix device selection --- .../madgraph/iolibs/template_files/mg7/run_card.toml | 2 +- madgraph/iolibs/template_files/mg7/madevent.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml index 1cb66c081a..ebe37f2191 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/run_card.toml @@ -1,6 +1,6 @@ [run] run_name = "run" -device = "cpu" # cuda, hip, cpp, cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto +device = "cppauto" # cuda, hip, cpp, cppnone, cppsse4, cppavx2, cpp512y, cpp512z, cppauto simd_vector_size = -1 thread_pool_size = -1 # -1 sets count automatically based on number of CPUs output_format = "lhe" # options: compact_npy, lhe_npy, lhe diff --git a/madgraph/iolibs/template_files/mg7/madevent.py b/madgraph/iolibs/template_files/mg7/madevent.py index 6de8e677c5..03915441de 100644 --- a/madgraph/iolibs/template_files/mg7/madevent.py 
+++ b/madgraph/iolibs/template_files/mg7/madevent.py @@ -263,14 +263,12 @@ def init_generator_config(self) -> None: def init_context(self) -> None: device_name = self.run_card["run"]["device"] - if device_name == "cpu": - device = ms.cpu_device() - elif device_name == "cuda": + if device_name == "cuda": device = ms.cuda_device() elif device_name == "hip": device = ms.hip_device() else: - raise ValueError("Unknown device") + device = ms.cpu_device() self.context = ms.Context(device) def init_subprocesses(self) -> None: @@ -597,7 +595,7 @@ def __init__(self, process: MadgraphProcess, meta: dict, subproc_id: int): subproc_dir = os.path.dirname(subproc_path) logger.info(f"Compiling subprocess {subproc_dir}") os.chdir(subproc_path) - backend = self.process.run_card.get("device", "cppnone") + backend = self.process.run_card["run"]["device"] subprocess.run(["make", "-j", f"BACKEND={backend}"]) os.chdir(cwd) From 5de3f5fb3b149d381b6803541293168a40b6fd96 Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Wed, 11 Feb 2026 14:50:19 +0100 Subject: [PATCH 22/33] added some instructions for installation and usage --- tutorial.md | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 tutorial.md diff --git a/tutorial.md b/tutorial.md new file mode 100644 index 0000000000..1b1ecddc7c --- /dev/null +++ b/tutorial.md @@ -0,0 +1,61 @@ +## MadGraph 7 tutorial + +### Installation + +First check out the MadGraph7 repository: +```sh +git clone git@github.com:MadGraphTeam/MadGraph7.git +cd MadGraph7 +``` + +Then install the pre-compiled `madspace` package using +```sh +pip install madspace +``` +If the binary distribution does not work for you, you can also build it from source +```sh +pip install scikit_build_core +cd madspace +pip install . +``` + +If you want to try out the `madboard` web interface, you can install it using +```sh +pip install madboard +``` + +### CPU usage + +Open the MadGraph shell with `bin/mg5_aMC`. 
Then use +``` +generate g g > t t~ g +output mg7_simd your_process_name +launch +``` +After typing launch, you can choose to edit the `run_card.toml`. Editing parameters using the `set` command is not yet possible. + +### CUDA usage + +Open the MadGraph shell with `bin/mg5_aMC`. Then use +``` +generate g g > t t~ g +output mg7_cuda your_process_name +launch +``` +You then have to edit a few entries in the `run_card.toml` by hand: +```toml +# in section [run] +device = "cuda" +thread_pool_size = 1 + +# in section [generation] +batch_size = 64000 +``` + +### MadBoard + +To use MadBoard, go to the directory with you process folders and run +```sh +madboard +``` +This will start a server and open MadBoard in your browser. From 2a7b01f81de0b43ec98dfbb5aa33ba01f64a5205 Mon Sep 17 00:00:00 2001 From: Daniele Massaro Date: Wed, 11 Feb 2026 17:10:22 +0100 Subject: [PATCH 23/33] Fix LHAPDF path when it is not absolute --- .../madgraph/iolibs/template_files/mg7/madevent.py | 8 ++++++-- madgraph/iolibs/template_files/mg7/madevent.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py index 87d8c73a21..139630111a 100644 --- a/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py +++ b/PLUGIN/CUDACPP_OUTPUT/madgraph/iolibs/template_files/mg7/madevent.py @@ -14,15 +14,19 @@ # for versions before 3.11 import pip._vendor.tomli as tomllib +MG_ROOT = os.path.abspath(os.path.realpath(__file__)).replace(os.path.join("madgraph", "iolibs", "template_files", "mg7", "madevent.py"), "") +cwd = os.getcwd() +os.chdir(MG_ROOT) if "LHAPDF_DATA_PATH" in os.environ: - PDF_PATH = os.environ["LHAPDF_DATA_PATH"] + PDF_PATH = os.path.abspath(os.environ["LHAPDF_DATA_PATH"]) else: try: import lhapdf lhapdf.setVerbosity(0) - PDF_PATH = lhapdf.paths()[0] + PDF_PATH = os.path.abspath(lhapdf.paths()[0]) except ImportError: raise 
RuntimeError("Can't load lhapdf module. Please set LHAPDF_DATA_PATH manually") +os.chdir(cwd) import madspace as ms from models.check_param_card import ParamCard diff --git a/madgraph/iolibs/template_files/mg7/madevent.py b/madgraph/iolibs/template_files/mg7/madevent.py index 03915441de..f26fd35819 100644 --- a/madgraph/iolibs/template_files/mg7/madevent.py +++ b/madgraph/iolibs/template_files/mg7/madevent.py @@ -14,15 +14,19 @@ # for versions before 3.11 import pip._vendor.tomli as tomllib +MG_ROOT = os.path.abspath(os.path.realpath(__file__)).replace(os.path.join("madgraph", "iolibs", "template_files", "mg7", "madevent.py"), "") +cwd = os.getcwd() +os.chdir(MG_ROOT) if "LHAPDF_DATA_PATH" in os.environ: - PDF_PATH = os.environ["LHAPDF_DATA_PATH"] + PDF_PATH = os.path.abspath(os.environ["LHAPDF_DATA_PATH"]) else: try: import lhapdf lhapdf.setVerbosity(0) - PDF_PATH = lhapdf.paths()[0] + PDF_PATH = os.path.abspath(lhapdf.paths()[0]) except ImportError: raise RuntimeError("Can't load lhapdf module. Please set LHAPDF_DATA_PATH manually") +os.chdir(cwd) import madspace as ms from models.check_param_card import ParamCard From 5b3d748cd5a4d78212b8cd0d062fa60b212d63c1 Mon Sep 17 00:00:00 2001 From: Daniele Massaro <66123362+Qubitol@users.noreply.github.com> Date: Wed, 11 Feb 2026 17:41:59 +0100 Subject: [PATCH 24/33] Update tutorial.md with PDF instructions --- tutorial.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tutorial.md b/tutorial.md index 1b1ecddc7c..753fdd6b93 100644 --- a/tutorial.md +++ b/tutorial.md @@ -19,6 +19,22 @@ cd madspace pip install . 
``` +For Parton Distribution Functions (PDFs), you need to either: +- install `lhapdf6` through the MadGraph CLI +- install the [python package `lhapdf-management`](https://pypi.org/project/lhapdf-management/) via `pip`: + ```bash + pip install lhapdf-management + ``` + and then set, e.g., `LHAPDF_DATA_PATH` environment variable: + ```bash + mkdir -p pdfs + export LHAPDF_DATA_PATH="$(pwd)/pdfs" + ``` + finally, install the PDF *NNPDF23_lo_as_0130_qed* + ```bash + lhapdf-management install NNPDF23_lo_as_0130_qed + ``` + If you want to try out the `madboard` web interface, you can install it using ```sh pip install madboard From 2545a89e5be565d3668743cb5a4bc7b1700a907f Mon Sep 17 00:00:00 2001 From: Theo Heimel Date: Thu, 12 Feb 2026 10:13:06 +0100 Subject: [PATCH 25/33] update tutorial --- tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial.md b/tutorial.md index 753fdd6b93..2d51944447 100644 --- a/tutorial.md +++ b/tutorial.md @@ -4,7 +4,7 @@ First check out the MadGraph7 repository: ```sh -git clone git@github.com:MadGraphTeam/MadGraph7.git +git clone -b heidelberg26 git@github.com:MadGraphTeam/MadGraph7.git cd MadGraph7 ``` From e0033d4a8b8cd935e2d407acef9556b978ed28f1 Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Tue, 31 Mar 2026 11:32:58 +0200 Subject: [PATCH 26/33] two test processes --- tests/processes/pp_ttx/launch.mg5 | 6 ++++++ tests/processes/pp_ttx/output.mg5 | 2 ++ tests/processes/pp_ttxj/launch.mg5 | 6 ++++++ tests/processes/pp_ttxj/output.mg5 | 2 ++ 4 files changed, 16 insertions(+) create mode 100644 tests/processes/pp_ttx/launch.mg5 create mode 100644 tests/processes/pp_ttx/output.mg5 create mode 100644 tests/processes/pp_ttxj/launch.mg5 create mode 100644 tests/processes/pp_ttxj/output.mg5 diff --git a/tests/processes/pp_ttx/launch.mg5 b/tests/processes/pp_ttx/launch.mg5 new file mode 100644 index 0000000000..2acf6a78e1 --- /dev/null +++ b/tests/processes/pp_ttx/launch.mg5 @@ -0,0 +1,6 @@ +launch PROC_pp_ttx 
+set vector_size 32 +set nevents 25k +set sde_strategy 1 +set gridpack True + diff --git a/tests/processes/pp_ttx/output.mg5 b/tests/processes/pp_ttx/output.mg5 new file mode 100644 index 0000000000..0299c86959 --- /dev/null +++ b/tests/processes/pp_ttx/output.mg5 @@ -0,0 +1,2 @@ +generate p p > t t~ +output madevent_simd PROC_pp_ttx diff --git a/tests/processes/pp_ttxj/launch.mg5 b/tests/processes/pp_ttxj/launch.mg5 new file mode 100644 index 0000000000..6be9aa7461 --- /dev/null +++ b/tests/processes/pp_ttxj/launch.mg5 @@ -0,0 +1,6 @@ +launch PROC_pp_ttxj +set vector_size 32 +set nevents 25k +set sde_strategy 1 +set gridpack True + diff --git a/tests/processes/pp_ttxj/output.mg5 b/tests/processes/pp_ttxj/output.mg5 new file mode 100644 index 0000000000..1a934c01e6 --- /dev/null +++ b/tests/processes/pp_ttxj/output.mg5 @@ -0,0 +1,2 @@ +generate p p > t t~ j +output madevent_simd PROC_pp_ttxj From 68bdc20b72185964fdd8ba3a84dc290881a3cf6f Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Tue, 31 Mar 2026 11:33:52 +0200 Subject: [PATCH 27/33] initial commit github->github --- .github/workflows/check-formatting.sh | 45 ++++++++ .github/workflows/code2gitlab.yml | 156 ++++++++++++++++++++++++++ .github/workflows/update_readme.py | 77 +++++++++++++ 3 files changed, 278 insertions(+) create mode 100755 .github/workflows/check-formatting.sh create mode 100644 .github/workflows/code2gitlab.yml create mode 100644 .github/workflows/update_readme.py diff --git a/.github/workflows/check-formatting.sh b/.github/workflows/check-formatting.sh new file mode 100755 index 0000000000..0c2effbbb9 --- /dev/null +++ b/.github/workflows/check-formatting.sh @@ -0,0 +1,45 @@ +if [ ! -d "src" ] && [ ! 
-d "SubProcesses" ]; then + echo "Error: Neither 'src' nor 'SubProcesses' directory exists, script needs to be run from the root directory of a generated process" >&2 + exit 255 +fi + +count=0 +total=0 +files_needing_format=() +patch_file="format-changes.patch" +num_file="files2format" + +# clean patch file if it exists +> "$patch_file" +> "$num_file" + +echo "" +echo "Checking formatting with clang-format..." +echo "" + +while IFS= read -r file; do + ((total++)) + if ! clang-format --dry-run --Werror "$file" 2>/dev/null; then + echo "=== $file needs formatting ===" + clang-format "$file" | diff -u "$file" - >> "$patch_file" + echo "" >> "$patch_file" + ((count++)) + files_needing_format+=("$file") + fi +done < <(find src SubProcesses -type f \( -name "*.cc" -o -name "*.h" \) 2>/dev/null) + + +echo "" +echo "Files needing formatting: $count" +echo "Total files checked: $total" + +if [ $count -gt 0 ]; then + echo "Detailed patch saved to '$patch_file'" + echo "If this scrpt is run in the CI workflow, detailed patches are provided as comment on the PR." +else + echo "All files are properly formatted." + rm "$patch_file" +fi +echo "" + +echo $count | tr -d \\n >> $num_file diff --git a/.github/workflows/code2gitlab.yml b/.github/workflows/code2gitlab.yml new file mode 100644 index 0000000000..466444cb5d --- /dev/null +++ b/.github/workflows/code2gitlab.yml @@ -0,0 +1,156 @@ +name: Generate code, check format and push to generated processes repo + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + generate-and-check: + runs-on: ubuntu-latest + + strategy: + matrix: + process: [pp_ttx, pp_ttxj] + + env: + PROC_DIR: PROC_${{ matrix.process }} + + steps: + - name: Checkout codegen repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: true + + - name: Run code generator + run: | + cd ${GITHUB_WORKSPACE} + if [ ! 
-e MG5aMC/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT ] ; then pushd MG5aMC/mg5amcnlo/PLUGIN/ && ln -s ../../../epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT && popd ; fi + ./MG5aMC/mg5amcnlo/bin/mg5_aMC test/processes/${{ matrix.process }}/output.mg5 + + - name: Check code format with clang-format + id: clang-format-check + run: | + chmod +x ${GITHUB_WORKSPACE}/.github/workflows/check-formatting.sh + cd ${GITHUB_WORKSPACE}/${{ env.PROC_DIR }} + ${GITHUB_WORKSPACE}/.github/workflows/check-formatting.sh + files2format=$(cat files2format) + echo "files2format=$files2format" >> $GITHUB_OUTPUT + + if [ "$files2format" -eq 0 ]; then + echo "✅ clang-format check passed." + else + echo "❌ clang-format check failed." + fi + + - name: Generate GitHub App token + if: ${{ steps.clang-format-check.outputs.files2format == '0' }} + id: app-token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ secrets.GH_APP_ID }} + private-key: ${{ secrets.GH_APP_PRIVATE_KEY }} + repositories: madgraph4gpu-generated-processes + + - name: Commit to target repository + if: ${{ steps.clang-format-check.outputs.files2format == '0' }} + env: + TARGET_REPO: roiser/madgraph4gpu-generated-processes + run: | + # Configure git + + git config --global user.name "github-actions" + git config --global user.email "github-actions@users.noreply.github.com" + + # Clone target repo + cd ${GITHUB_WORKSPACE} + git clone https://x-access-token:${{ steps.app-token.outputs.token }}@github.com/${TARGET_REPO}.git + cd ${GITHUB_WORKSPACE}/madgraph4gpu-generated-processes + + # Create branch based on PR number + BRANCH_NAME="codegen-pr-${{ github.event.pull_request.number }}" + git checkout -b ${BRANCH_NAME} + + # Copy generated files (adjust paths as needed) + cp -r ${GITHUB_WORKSPACE}/${{ env.PROC_DIR }} . + + # Commit and push + git add . 
+ git commit -m "Code generated from PR #${{ github.event.pull_request.number }}" \ + -m "Source PR: ${{ github.event.pull_request.html_url }}" + + git push -f origin ${BRANCH_NAME} + + echo "✅ Pushed to ${TARGET_REPO} on branch ${BRANCH_NAME}" + + - name: Set up Python + if: ${{ steps.commit_to_target_repository.outcome == 'success' }} + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Update README.md + if: ${{ steps.commit_to_target_repository.outcome == 'success' }} + run: | + cd ${GITHUB_WORKSPACE}/madgraph4gpu-generated-processes + git checkout main + + python ${GITHUB_WORKSPACE}/.github/workflows/update_readme.py \ + --title "${{ github.event.pull_request.title }}" \ + --url "${{ github.event.pull_request.html_url }}" \ + --pr_num "${{ github.event.pull_request.number }}" \ + --created "${{ github.event.pull_request.created_at }}" \ + --last_updated "${{ github.event.pull_request.updated_at }}" \ + --wip "${{ github.event.pull_request.draft }}" \ + --num_commits "${{ github.event.pull_request.commits }}" \ + --author "${{ github.event.pull_request.user.login }}" + + git add . + git commit -m "Update README for PR #${{ github.event.pull_request.number }}" + git push -f origin main + + echo "✅ Pushed to ${TARGET_REPO} on branch main" + + - name: Read patch file + if: ${{ steps.clang-format-check.outputs.files2format != '0' }} + run: | + { + echo "patch_file_content<> $GITHUB_OUTPUT + + - name: Comment on PR with format issues + if: ${{ steps.clang-format-check.outputs.files2format != '0' }} + uses: actions/github-script@v7 + env: + FILES2FORMAT: ${{ steps.clang-format-check.outputs.files2format }} + with: + script: | + + const fs = require('fs'); + const filesToFormat = parseInt(process.env.FILES2FORMAT); + const patchContent = fs.readFileSync(`${process.env.PROC_DIR}/format-changes.patch`, 'utf8'); + + const comment = `## ❌ Code Format Check Failed + + The generated code does not conform to clang-format rules. 
+ + Please update your code generator to produce properly formatted code. + ${filesToFormat} files need formatting. + + Note: The CI is setup so it fails early, if formatting issues are detected for any of the processes. + The report below is for process **${process.env.PROC_DIR}** with backend **${process.env.BACKEND}**. + + See attached patch for details:`; + + github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `${comment} \n \`\`\` \n ${patchContent} \n \`\`\`` + }); + + - name: Fail if format check failed + if: ${{ steps.clang-format-check.outputs.files2format != '0' }} + run: exit 1 diff --git a/.github/workflows/update_readme.py b/.github/workflows/update_readme.py new file mode 100644 index 0000000000..21c72607e0 --- /dev/null +++ b/.github/workflows/update_readme.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +import json +import optparse + +class UpdateReadme: + def __init__(self, title, url, pr_num, created, last_updated, wip, num_commits, author): + self.title = title + self.url = url + self.pr_num = pr_num + self.created = created + self.last_updated = last_updated + self.wip = wip + self.num_commits = num_commits + self.author = author + self.remote_repo_url = "https://github.com/roiser/madgraph4gpu-generated-processes/" + + def update_json(self): + fh = open('readme_data.json', 'r') + data = json.load(fh) + fh.close() + + if self.pr_num not in data: + new_entry = { + "title": self.title, + "branch": self.remote_repo_url + "tree/codegen-pr-" + str(self.pr_num), + "url": self.url, + "created": self.created, + "last_updated": self.last_updated, + "wip": self.wip, + "num_commits": self.num_commits, + "author": self.author + } + data[self.pr_num] = new_entry + else: + data[self.pr_num]['last_updated'] = self.last_updated + data[self.pr_num]['wip'] = self.wip + data[self.pr_num]['num_commits'] = self.num_commits + data[self.pr_num]['title'] = self.title + + fh = 
open('readme_data.json', 'w') + json.dump(data, fh, indent=4) + fh.close() + + def write_readme(self): + fh = open('readme_data.json', 'r') + data = json.load(fh) + fh.close() + + sorted_prs = sorted(data.items(), key=lambda x: x[1]['last_updated'], reverse=True) + + with open('README.md', 'w') as fh: + fh.write("# Pull Request Summary\n\n") + fh.write("| PR Number | Local Branch | Title | WIP | # Commits | Author | Created | Last Update |\n") + fh.write("|-----------|--------------|-------|-----|-----------|--------|---------|-------------|\n") + for pr_num, details in sorted_prs: + fh.write(f"| [{pr_num}]({details['url']}) | [{details['branch'].split('/')[-1]}]({details['branch']}) | {details['title']} | {details['wip']} | {details['num_commits']} | {details['author']} | {details['created']} | {details['last_updated']} |\n") + + def run(self): + self.update_json() + self.write_readme() + +if __name__ == "__main__": + parser = optparse.OptionParser() + parser.add_option('--title', dest='title', help='PR title') + parser.add_option('--url', dest='url', help='PR URL') + parser.add_option('--pr_num', dest='pr_num', help='PR number') + parser.add_option('--created', dest='created', help='PR creation date') + parser.add_option('--last_updated', dest='last_updated', help='PR last updated date') + parser.add_option('--wip', dest='wip', help='Is PR WIP') + parser.add_option('--num_commits', dest='num_commits', help='Number of commits in PR') + parser.add_option('--author', dest='author', help='PR author') + + options, _ = parser.parse_args() + + updater = UpdateReadme(options.title, options.url, options.pr_num, options.created, options.last_updated, options.wip, options.num_commits, options.author) + updater.run() \ No newline at end of file From 1811cc46a228a8d74d12fbc9054bcf514f740511 Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Tue, 31 Mar 2026 12:04:33 +0200 Subject: [PATCH 28/33] adapt to gitlab.cern --- .github/workflows/code2gitlab.yml | 22 
+++++++--------------- .github/workflows/update_readme.py | 2 +- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/.github/workflows/code2gitlab.yml b/.github/workflows/code2gitlab.yml index 466444cb5d..596deebd57 100644 --- a/.github/workflows/code2gitlab.yml +++ b/.github/workflows/code2gitlab.yml @@ -14,6 +14,9 @@ jobs: env: PROC_DIR: PROC_${{ matrix.process }} + TARGET_HOST: gitlab.cern.ch + TARGET_DIR: Madgraph7GenCode + TARGET_REPO: MadgraphTeam/${TARGET_DIR} steps: - name: Checkout codegen repo @@ -42,20 +45,9 @@ jobs: else echo "❌ clang-format check failed." fi - - - name: Generate GitHub App token - if: ${{ steps.clang-format-check.outputs.files2format == '0' }} - id: app-token - uses: actions/create-github-app-token@v1 - with: - app-id: ${{ secrets.GH_APP_ID }} - private-key: ${{ secrets.GH_APP_PRIVATE_KEY }} - repositories: madgraph4gpu-generated-processes - - name: Commit to target repository if: ${{ steps.clang-format-check.outputs.files2format == '0' }} - env: - TARGET_REPO: roiser/madgraph4gpu-generated-processes + id: commit_to_target_repository run: | # Configure git @@ -64,8 +56,8 @@ jobs: # Clone target repo cd ${GITHUB_WORKSPACE} - git clone https://x-access-token:${{ steps.app-token.outputs.token }}@github.com/${TARGET_REPO}.git - cd ${GITHUB_WORKSPACE}/madgraph4gpu-generated-processes + git clone https://oauth2:${{ secrets.GITLAB_ACCESS_TOKEN }}@${TARGET_HOST}/${TARGET_REPO}.git + cd ${GITHUB_WORKSPACE}/${TARGET_DIR} # Create branch based on PR number BRANCH_NAME="codegen-pr-${{ github.event.pull_request.number }}" @@ -92,7 +84,7 @@ jobs: - name: Update README.md if: ${{ steps.commit_to_target_repository.outcome == 'success' }} run: | - cd ${GITHUB_WORKSPACE}/madgraph4gpu-generated-processes + cd ${GITHUB_WORKSPACE}/${TARGET_DIR} git checkout main python ${GITHUB_WORKSPACE}/.github/workflows/update_readme.py \ diff --git a/.github/workflows/update_readme.py b/.github/workflows/update_readme.py index 21c72607e0..4c8f2172dc 100644 
--- a/.github/workflows/update_readme.py +++ b/.github/workflows/update_readme.py @@ -13,7 +13,7 @@ def __init__(self, title, url, pr_num, created, last_updated, wip, num_commits, self.wip = wip self.num_commits = num_commits self.author = author - self.remote_repo_url = "https://github.com/roiser/madgraph4gpu-generated-processes/" + self.remote_repo_url = "https://gitlab.cern.ch/MadGraphTeam/Madgraph7GenCode/" def update_json(self): fh = open('readme_data.json', 'r') From 637a3c8fad5610382e2dada7622e03d2ae0afcca Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Tue, 31 Mar 2026 13:15:07 +0200 Subject: [PATCH 29/33] fix invoation --- .github/workflows/code2gitlab.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/code2gitlab.yml b/.github/workflows/code2gitlab.yml index 596deebd57..21e47fb1e4 100644 --- a/.github/workflows/code2gitlab.yml +++ b/.github/workflows/code2gitlab.yml @@ -28,8 +28,7 @@ jobs: - name: Run code generator run: | cd ${GITHUB_WORKSPACE} - if [ ! 
-e MG5aMC/mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT ] ; then pushd MG5aMC/mg5amcnlo/PLUGIN/ && ln -s ../../../epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT && popd ; fi - ./MG5aMC/mg5amcnlo/bin/mg5_aMC test/processes/${{ matrix.process }}/output.mg5 + ./bin/mg5_aMC test/processes/${{ matrix.process }}/output.mg5 - name: Check code format with clang-format id: clang-format-check From 8a4eff6d742517ef6161e9b8bbf7d9f39c1e3ca5 Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Tue, 31 Mar 2026 13:23:29 +0200 Subject: [PATCH 30/33] run on every push --- .github/workflows/code2gitlab.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/code2gitlab.yml b/.github/workflows/code2gitlab.yml index 21e47fb1e4..549156e359 100644 --- a/.github/workflows/code2gitlab.yml +++ b/.github/workflows/code2gitlab.yml @@ -3,6 +3,9 @@ name: Generate code, check format and push to generated processes repo on: pull_request: types: [opened, synchronize, reopened] + push: + branches: + - gen4gitlab # fixme jobs: generate-and-check: From d602f81c6ed55b67d48acfaddf39ba0544c44322 Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Tue, 31 Mar 2026 13:33:23 +0200 Subject: [PATCH 31/33] fix typo --- .github/workflows/code2gitlab.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code2gitlab.yml b/.github/workflows/code2gitlab.yml index 549156e359..17fba5f842 100644 --- a/.github/workflows/code2gitlab.yml +++ b/.github/workflows/code2gitlab.yml @@ -31,7 +31,7 @@ jobs: - name: Run code generator run: | cd ${GITHUB_WORKSPACE} - ./bin/mg5_aMC test/processes/${{ matrix.process }}/output.mg5 + ./bin/mg5_aMC tests/processes/${{ matrix.process }}/output.mg5 - name: Check code format with clang-format id: clang-format-check From 67fe626ea273f6661c69cd01f3f3d561b82def7b Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Tue, 31 Mar 2026 13:36:38 +0200 Subject: [PATCH 32/33] its intended to run only for a pr --- .github/workflows/code2gitlab.yml | 4 ---- 1 
file changed, 4 deletions(-) diff --git a/.github/workflows/code2gitlab.yml b/.github/workflows/code2gitlab.yml index 17fba5f842..b549002347 100644 --- a/.github/workflows/code2gitlab.yml +++ b/.github/workflows/code2gitlab.yml @@ -3,14 +3,10 @@ name: Generate code, check format and push to generated processes repo on: pull_request: types: [opened, synchronize, reopened] - push: - branches: - - gen4gitlab # fixme jobs: generate-and-check: runs-on: ubuntu-latest - strategy: matrix: process: [pp_ttx, pp_ttxj] From dd14b1536fe8fe6ab69a98fd47937fd3047d149e Mon Sep 17 00:00:00 2001 From: Stefan Roiser Date: Tue, 31 Mar 2026 13:38:01 +0200 Subject: [PATCH 33/33] empty