diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index 7dd6a2f963..01bc380fcb 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -1,83 +1,83 @@
-name: C/C++ CI
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-  workflow_dispatch:
-
-jobs:
-  debug_builds:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum ]
-      fail-fast: false
-    steps:
-    - uses: actions/checkout@v2
-    - name: make debug
-      run: make -C ${{ matrix.folder }} debug
-  CPU:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        folder: [ epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum , epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg ]
-        precision: [ d , f , m ]
-        backend: [ cppnone, cppauto ]
-      fail-fast: false
-    steps:
-    - uses: actions/checkout@v2
-    - name: github PR info
-      run: date; echo github.event.pull_request.head.sha='${{ github.event.pull_request.head.sha }}'
-    - name: make info
-      run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info
-    - name: make
-      run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
-    - name: make test
-      run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk test
-  CPU_MAC:
-    runs-on: macos-latest
-    env:
-      FC: gfortran-14 # see #971
-    strategy:
-      matrix:
-        folder: [ epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum, epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg ]
-        precision: [ d , f , m ]
-        backend: [ cppnone, cppsse4 ]
-      fail-fast: false
-    steps:
-    - uses: actions/checkout@v2
-    - name: github PR info
-      run: date; echo github.event.pull_request.head.sha='${{ github.event.pull_request.head.sha }}'
-    - name: make info
-      run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info
-    - name: make
-      run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
-    - name: make test
-      run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk test
-  GPU:
-    runs-on: self-hosted
-    # runs-on: madgraph5-h100
-    # container: registry.cern.ch/ngt/lxplus-like:9
-    env:
-      CUDA_HOME: /usr/local/cuda/
-      FC: gfortran
-    strategy:
-      matrix:
-        folder: [ epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum , epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg ]
-        precision: [ d , f , m ]
-        backend: [ cppauto, cuda ]
-      fail-fast: false
-    steps:
-    - uses: actions/checkout@v2
-    - name: path
-      run: echo "PATH=$PATH"
-    - name: github PR info
-      run: date; echo github.event.pull_request.head.sha='${{ github.event.pull_request.head.sha }}'
-    - name: make info
-      run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info
-    - name: make
-      run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
-    - name: make test
-      run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk test
+# name: C/C++ CI
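+# NB: this workflow is disabled - every line of this file is commented out; uncomment the whole file to re-enable it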
+# on:
+#   push:
+#     branches: [ master ]
+#   pull_request:
+#     branches: [ master ]
+#   workflow_dispatch:
+#
+# jobs:
+#   debug_builds:
+#     runs-on: ubuntu-latest
+#     strategy:
+#       matrix:
+#         folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum ]
+#       fail-fast: false
+#     steps:
+#     - uses: actions/checkout@v2
+#     - name: make debug
+#       run: make -C ${{ matrix.folder }} debug
+#   CPU:
+#     runs-on: ubuntu-latest
+#     strategy:
+#       matrix:
+#         folder: [ epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum , epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg ]
+#         precision: [ d , f , m ]
+#         backend: [ cppnone, cppauto ]
+#       fail-fast: false
+#     steps:
+#     - uses: actions/checkout@v2
+#     - name: github PR info
+#       run: date; echo github.event.pull_request.head.sha='${{ github.event.pull_request.head.sha }}'
+#     - name: make info
+#       run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info
+#     - name: make
+#       run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
+#     - name: make test
+#       run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk test
+#   CPU_MAC:
+#     runs-on: macos-latest
+#     env:
+#       FC: gfortran-14 # see #971
+#     strategy:
+#       matrix:
+#         folder: [ epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum, epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg ]
+#         precision: [ d , f , m ]
+#         backend: [ cppnone, cppsse4 ]
+#       fail-fast: false
+#     steps:
+#     - uses: actions/checkout@v2
+#     - name: github PR info
+#       run: date; echo github.event.pull_request.head.sha='${{ github.event.pull_request.head.sha }}'
+#     - name: make info
+#       run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info
+#     - name: make
+#       run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
+#     - name: make test
+#       run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk test
+#   GPU:
+#     runs-on: self-hosted
+#     # runs-on: madgraph5-h100
+#     # container: registry.cern.ch/ngt/lxplus-like:9
+#     env:
+#       CUDA_HOME: /usr/local/cuda/
+#       FC: gfortran
+#     strategy:
+#       matrix:
+#         folder: [ epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum , epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg ]
+#         precision: [ d , f , m ]
+#         backend: [ cppauto, cuda ]
+#       fail-fast: false
+#     steps:
+#     - uses: actions/checkout@v2
+#     - name: path
+#       run: echo "PATH=$PATH"
+#     - name: github PR info
+#       run: date; echo github.event.pull_request.head.sha='${{ github.event.pull_request.head.sha }}'
+#     - name: make info
+#       run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info
+#     - name: make
+#       run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }}
+#     - name: make test
+#       run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk test
diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh
index 624f32ba09..69a243d96a 100755
--- a/.github/workflows/testsuite_oneprocess.sh
+++ b/.github/workflows/testsuite_oneprocess.sh
@@ -165,7 +165,15 @@ function build() {
       make -f cudacpp.mk gtestlibs
     fi
     # NB: 'make bldall' internally checks if 'which nvcc' and 'which hipcc' succeed before attempting to build cuda and hip
-    make -j bldall
+    if [ "${proc##*.}" == "sa" ]; then
+        # standalone: just use the makefile (symlinked to cudacpp.mk), the overlay is not needed
+        rm -f cudacpp_overlay.mk # -f: do not complain if the overlay is already absent (e.g. build re-run)
+        ls -l
+        make -j bldall
+    else
+        # otherwise: the project makefile is no longer patched, the cudacpp targets come from the overlay
+        make -j -f makefile -f cudacpp_overlay.mk bldall
+    fi
     popd >& /dev/null
   done
 }
@@ -268,7 +276,13 @@ function tput_test() {
           ECHO
           echo "DEBUG: execute tests in directory ${bdir}"
           if [ ! -f ${bdir}/runTest_${suffix}.exe ]; then echo "ERROR! ${bdir}/runTest_${suffix}.exe not found?"; exit 1; fi
+          # Diagnostics before the real run (best-effort: a failing probe must never change the test outcome)
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}"
+          ldd "${bdir}/runTest_${suffix}.exe" || true
+          LD_DEBUG=libs "${bdir}/runTest_${suffix}.exe" 2>&1 | head -n 80 || true
+          command -v objdump >/dev/null && objdump -d "${bdir}/runTest_${suffix}.exe" | head -n 60 || true
           runExe ${bdir}/runTest_${suffix}.exe
+          #
           if [ ! -f ${bdir}/check_${suffix}.exe ]; then echo "ERROR! ${bdir}/check_${suffix}.exe not found?"; exit 1; fi
           runExe ${bdir}/check_${suffix}.exe -p 1 32 1
         done
diff --git a/.github/workflows/testsuite_oneprocess.yml b/.github/workflows/testsuite_oneprocess.yml
index a43fe7939a..b1659a1c36 100644
--- a/.github/workflows/testsuite_oneprocess.yml
+++ b/.github/workflows/testsuite_oneprocess.yml
@@ -236,6 +236,20 @@ jobs:
           buildcache-${{ runner.os }}-${{ inputs.process }}-${{ matrix.fptype }}-${{ steps.split.outputs.prnum }}
           buildcache-${{ runner.os }}-${{ inputs.process }}-${{ matrix.fptype }}
 
+    - name: Runner CPU info
+      run: |
+        # Diagnostics only: every probe is best-effort; a compiler is queried only if its variable is set
+        uname -a; lscpu || true
+        head -n 20 /proc/cpuinfo || true
+        echo "CC: ${CC:-}"; [ -z "${CC:-}" ] || $CC --version || true
+        echo "CXX: ${CXX:-}"; [ -z "${CXX:-}" ] || $CXX --version || true
+        echo "FC: ${FC:-}"; [ -z "${FC:-}" ] || $FC --version || true
+
+    - name: Enable core dumps
+      run: |
+        # 'ulimit -c' only affects the shell it runs in (each step is a new shell), so it is raised in the tput_test step; only the kernel-wide core name pattern is set here
+        echo 'core.%e.%p' | sudo tee /proc/sys/kernel/core_pattern
+
     - name: before_build
       run: .github/workflows/testsuite_oneprocess.sh before_build ${{ inputs.process }}
  
@@ -257,7 +271,39 @@ jobs:
         key: buildcache-${{ runner.os }}-${{ inputs.process }}-${{ matrix.fptype }}-${{ steps.split.outputs.prnum }}-${{ github.run_id }}
 
     - name: tput_test
-      run: .github/workflows/testsuite_oneprocess.sh tput_test ${{ inputs.process }}
+      # Inline script instead of a direct call: the core-size limit is raised in the same shell that launches the tests
+      run: |
+        set -euo pipefail
+        ulimit -c unlimited
+        .github/workflows/testsuite_oneprocess.sh tput_test ${{ inputs.process }}
+
+    - name: If crash, print SIGILL site
+      if: always()
+      run: |
+        # Best-effort post-mortem of core dumps: no command here may fail the step (the default shell is 'bash -e')
+        cores=$(find . -maxdepth 6 -name 'core.*' 2>/dev/null || true)
+        if [ -z "$cores" ]; then echo "No core files found, nothing to analyse"; exit 0; fi
+        command -v gdb >/dev/null || { sudo apt-get update && sudo apt-get install -y gdb; } || true
+        for c in $cores; do
+          exe=$(echo "$c" | sed -n 's/.*core\.\(.*\)\.[0-9]\+/\1/p')
+          echo "---- $c (exe guess: $exe) ----"
+          for ff in f d m; do
+            exe_full_path="${c%/*}/build.none_${ff}_inl0_hrd0/$exe"
+            if [ -f "$exe_full_path" ]; then echo "OK"; else continue; fi
+            echo ""
+            echo "==============="
+            echo "$exe_full_path"
+            echo "==============="
+            command -v gdb >/dev/null && gdb -batch -q "$exe_full_path" "$c" -ex 'info reg' -ex 'info files' -ex 'bt' -ex 'x/12i $rip' || true
+            echo ""
+            command -v objdump >/dev/null && objdump -d "$exe_full_path" | head -n 60 || true
+            echo ""
+            # Look for AVX-512 signatures (zmm/mask registers); grep exits 1 when nothing matches, which is not an error here
+            objdump -d "$exe_full_path" | grep -nE '\bzmm|k[0-7]\b|evex' || true
+            objdump -d "$exe_full_path" | grep -nE 'v[a-z].*zmm|k[0-7]' || true
+            echo ""
+          done
+        done
 
     - name: tmad_test
       run: .github/workflows/testsuite_oneprocess.sh tmad_test ${{ inputs.process }}
diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo
index 3c332e08bc..0e967a68c1 160000
--- a/MG5aMC/mg5amcnlo
+++ b/MG5aMC/mg5amcnlo
@@ -1 +1 @@
-Subproject commit 3c332e08bcaf2b902b67f5c5948601b14891aad2
+Subproject commit 0e967a68c19771919d0131ec0e9120b8541a79ba
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
deleted file mode 100644
index 1c5e505267..0000000000
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+++ /dev/null
@@ -1,152 +0,0 @@
-diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f
-index af6d02998..ec5722702 100644
---- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f
-+++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f
-@@ -76,7 +76,15 @@ c      common/to_colstats/ncols,ncolflow,ncolalt,ic
- 
-       include 'coupl.inc' ! needs VECSIZE_MEMMAX (defined in vector.inc)
-       INTEGER VECSIZE_USED
--      DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime
-+
-+      character*255 env_name, env_value
-+      integer env_length, env_status
-+
-+#ifdef MG5AMC_MEEXPORTER_CUDACPP
-+      INCLUDE 'fbridge.inc'
-+c     INCLUDE 'fbridge_common.inc'
-+#endif
-+      INCLUDE 'fbridge_common.inc'
- 
- C-----
- C  BEGIN CODE
-@@ -84,6 +92,61 @@ C-----
-       call cpu_time(t_before)
-       CUMULATED_TIMING = t_before
- 
-+#ifdef _OPENMP
-+      CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD()
-+#endif
-+      CALL COUNTERS_INITIALISE()
-+
-+#ifdef MG5AMC_MEEXPORTER_CUDACPP
-+      fbridge_mode = 1 ! CppOnly=1, default for CUDACPP
-+#else
-+      fbridge_mode = 0 ! FortranOnly=0, default for FORTRAN
-+#endif
-+      env_name = 'CUDACPP_RUNTIME_FBRIDGEMODE'
-+      call get_environment_variable(env_name, env_value, env_length, env_status)
-+      if( env_status.eq.0 ) then
-+        write(*,*) 'Found environment variable "', trim(env_name), '" with value "', trim(env_value), '"'
-+        read(env_value,'(I255)') FBRIDGE_MODE ! see https://gcc.gnu.org/onlinedocs/gfortran/ICHAR.html
-+        write(*,*) 'FBRIDGE_MODE (from env) = ', FBRIDGE_MODE
-+      else if( env_status.eq.1 ) then ! 1 = not defined
-+        write(*,*) 'FBRIDGE_MODE (default) = ', FBRIDGE_MODE
-+      else ! -1 = too long for env_value, 2 = not supported by O/S
-+        write(*,*) 'ERROR! get_environment_variable failed for "', trim(env_name), '"'
-+        STOP
-+      endif
-+#ifndef MG5AMC_MEEXPORTER_CUDACPP
-+      if( fbridge_mode.ne.0 ) then
-+        write(*,*) 'ERROR! Invalid fbridge_mode (in FORTRAN backend mode) = ', fbridge_mode
-+        STOP
-+      endif
-+#endif
-+
-+      vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP
-+      env_name = 'CUDACPP_RUNTIME_VECSIZEUSED'
-+      call get_environment_variable(env_name, env_value, env_length, env_status)
-+      if( env_status.eq.0 ) then
-+        write(*,*) 'Found environment variable "', trim(env_name), '" with value "', trim(env_value), '"'
-+        read(env_value,'(I255)') VECSIZE_USED ! see https://gcc.gnu.org/onlinedocs/gfortran/ICHAR.html
-+        write(*,*) 'VECSIZE_USED (from env) = ', VECSIZE_USED
-+      else if( env_status.eq.1 ) then ! 1 = not defined
-+        write(*,*) 'VECSIZE_USED (default) = ', VECSIZE_USED
-+      else ! -1 = too long for env_value, 2 = not supported by O/S
-+        write(*,*) 'ERROR! get_environment_variable failed for "', trim(env_name), '"'
-+        STOP
-+      endif
-+      if( VECSIZE_USED.gt.VECSIZE_MEMMAX .or. VECSIZE_USED.le.0 ) then
-+        write(*,*) 'ERROR! Invalid VECSIZE_USED = ', VECSIZE_USED
-+        STOP
-+      endif
-+
-+#ifdef MG5AMC_MEEXPORTER_CUDACPP
-+      CALL FBRIDGECREATE(FBRIDGE_PBRIDGE, VECSIZE_USED, NEXTERNAL, 4) ! this must be at the beginning as it initialises the CUDA device
-+      FBRIDGE_NCBYF1 = 0
-+      FBRIDGE_CBYF1SUM = 0
-+      FBRIDGE_CBYF1SUM2 = 0
-+      FBRIDGE_CBYF1MAX = -1D100
-+      FBRIDGE_CBYF1MIN = 1D100
-+#endif
- c
- c     Read process number
- c
-@@ -217,9 +280,33 @@ c      call sample_result(xsec,xerr)
- c      write(*,*) 'Final xsec: ',xsec
- 
-       rewind(lun)
--
-       close(lun)
- 
-+#ifdef MG5AMC_MEEXPORTER_CUDACPP
-+      CALL FBRIDGEDELETE(FBRIDGE_PBRIDGE) ! this must be at the end as it shuts down the CUDA device
-+      IF( FBRIDGE_MODE .LE. -1 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
-+        WRITE(*,'(a,f10.8,a,e8.2)')
-+     &    ' [MERATIOS] ME ratio CudaCpp/Fortran: MIN = ',
-+     &    FBRIDGE_CBYF1MIN + 1, ' = 1 - ', -FBRIDGE_CBYF1MIN
-+        WRITE(*,'(a,f10.8,a,e8.2)')
-+     &    ' [MERATIOS] ME ratio CudaCpp/Fortran: MAX = ',
-+     &    FBRIDGE_CBYF1MAX + 1, ' = 1 + ', FBRIDGE_CBYF1MAX
-+        WRITE(*,'(a,i6)')
-+     &    ' [MERATIOS] ME ratio CudaCpp/Fortran: NENTRIES = ',
-+     &    FBRIDGE_NCBYF1
-+c        WRITE(*,'(a,e8.2)')
-+c    &    ' [MERATIOS] ME ratio CudaCpp/Fortran - 1: AVG = ',
-+c    &    FBRIDGE_CBYF1SUM / FBRIDGE_NCBYF1
-+c       WRITE(*,'(a,e8.2)')
-+c    &    ' [MERATIOS] ME ratio CudaCpp/Fortran - 1: STD = ',
-+c    &    SQRT( FBRIDGE_CBYF1SUM2 / FBRIDGE_NCBYF1 ) ! ~standard deviation
-+        WRITE(*,'(a,e8.2,a,e8.2)')
-+     &    ' [MERATIOS] ME ratio CudaCpp/Fortran - 1: AVG = ',
-+     &    FBRIDGE_CBYF1SUM / FBRIDGE_NCBYF1, ' +- ',
-+     &    SQRT( FBRIDGE_CBYF1SUM2 ) / FBRIDGE_NCBYF1 ! ~standard error
-+      ENDIF
-+#endif
-+      CALL COUNTERS_FINALISE()
-       end
- 
- c     $B$ get_user_params $B$ ! tag for MadWeight
-diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
-index bf488e4b0..707ea4032 100644
---- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
-+++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
-@@ -71,7 +71,10 @@ C
-       DATA NB_FAIL /0/
-       DOUBLE PRECISION GET_CHANNEL_CUT
-       EXTERNAL GET_CHANNEL_CUT
--
-+C
-+      INTEGER NGOODHEL ! -1 if not yet retrieved and printed
-+      SAVE NGOODHEL
-+      DATA NGOODHEL/-1/
- C     
- C     This is just to temporarily store the reference grid for
- C      helicity of the DiscreteSampler so as to obtain its number of
-@@ -224,6 +227,17 @@ C            update.
-           ENDIF
-           IF(NTRY(1).EQ.MAXTRIES)THEN
-             ISHEL=MIN(ISUM_HEL,NGOOD)
-+C           Print the number of good helicities
-+            IF (NGOODHEL.EQ.-1) THEN
-+              NGOODHEL=0
-+              DO I=1,NCOMB
-+                IF (GOODHEL(I,1)) THEN
-+                  NGOODHEL=NGOODHEL+1
-+                ENDIF
-+              END DO
-+              WRITE (6,*) 'NGOODHEL =', NGOODHEL
-+              WRITE (6,*) 'NCOMB =', NCOMB
-+            ENDIF
-           ENDIF
-         ENDIF
-       ELSE IF (.NOT.INIT_MODE) THEN  ! random helicity 
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
deleted file mode 100644
index ce1c49dc2a..0000000000
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
+++ /dev/null
@@ -1,312 +0,0 @@
-diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile
-index 348c283be..49e6800ff 100644
---- b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile
-+++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile
-@@ -1,6 +1,37 @@
-+SHELL := /bin/bash
-+
- include ../../Source/make_opts
-+
-+# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829)
-+# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing
-+include ../../src/cudacpp_config.mk
-+ifeq ($(CUDACPP_BUILDDIR),)
-+$(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!)
-+endif
-+
-+# Disable all Fortran warnings?
- FFLAGS+= -w
- 
-+# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html
-+FFLAGS+= -cpp
-+
-+# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740)
-+CXXFLAGS = -O3 -Wall -Wshadow -Wextra
-+
-+# Add -std=c++17 explicitly to avoid build errors on macOS
-+# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked"
-+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
-+CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3
-+endif
-+
-+# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran)
-+ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
-+  override CXX:=ccache $(CXX)
-+endif
-+###ifeq ($(USECCACHE)$(shell echo $(FC) | grep ccache),1)
-+###  override FC:=ccache $(FC)
-+###endif
-+
- # Load additional dependencies of the bias module, if present
- ifeq (,$(wildcard ../bias_dependencies))
- BIASDEPENDENCIES =
-@@ -24,7 +55,20 @@ else
-     MADLOOP_LIB =
- endif
- 
--LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L../../lib/ -ldhelas -ldsample -lmodel -lgeneric -lpdf -lgammaUPC -lcernlib $(llhapdf) -lbias 
-+LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias 
-+
-+CUDACPP_MAKEFILE=cudacpp.mk
-+processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
-+ifeq ($(BACKEND),cuda)
-+CUDACPP_COMMONLIB=mg5amc_common_cuda
-+CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda
-+else ifeq ($(BACKEND),hip)
-+CUDACPP_COMMONLIB=mg5amc_common_hip
-+CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip
-+else
-+CUDACPP_COMMONLIB=mg5amc_common_cpp
-+CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp
-+endif
- 
- LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS)
- 
-@@ -43,41 +87,148 @@ ifeq ($(strip $(MATRIX_HEL)),)
- endif
- 
- 
--PROCESS= driver.o myamp.o genps.o unwgt.o setcuts.o get_color.o \
-+PROCESS= myamp.o genps.o unwgt.o setcuts.o get_color.o \
-          cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \
--	 idenparts.o dummy_fct.o \
--         $(patsubst %.f,%.o,$(wildcard auto_dsig*.f)) \
-+	 idenparts.o dummy_fct.o
-+
-+DSIG=driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f)))
-+DSIG_cudacpp=driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f)))
- 
- SYMMETRY = symmetry.o idenparts.o 
- 
- # Binaries
- 
--$(PROG): $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX)
--	$(FC) -o $(PROG) $(PROCESS) $(MATRIX) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp
-+ifeq ($(UNAME),Darwin)
-+LDFLAGS += -lc++ # avoid 'Undefined symbols' for chrono::steady_clock on macOS (checked with otool -L libmg5amc_gg_ttx_cpp.so) 
-+LDFLAGS += -mmacosx-version-min=11.3 # avoid "ld: warning: object file was built for newer macOS version than being linked"  
-+else
-+LDFLAGS += -Wl,--no-relax # avoid 'failed to convert GOTPCREL relocation' error #458 (not supported on macOS)
-+endif
- 
--$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL)
--	$(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp
-+# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal)
-+.DEFAULT_GOAL := all
- 
--gensym: $(SYMMETRY) configs.inc $(LIBS)
--	$(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS)
-+ifeq ($(BACKEND),cuda)
-+all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda
-+else ifeq ($(BACKEND),hip)
-+all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip
-+else
-+all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp
-+endif
- 
--$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat
--	cd ../../Source/MODEL; make
-+# Disable OpenMP by default: enable OpenMP only if USEOPENMP=1 (#758)
-+ifeq ($(USEOPENMP),1)
-+ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
-+override OMPFLAGS = -fopenmp
-+LINKLIBS += -liomp5 # see #578
-+LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy'
-+else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
-+override OMPFLAGS = -fopenmp
-+$(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604
-+else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
-+override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang
-+else
-+override OMPFLAGS = -fopenmp
-+endif
-+endif
-+
-+$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o
-+	$(FC) -o $(PROG)_fortran $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS)
- 
--$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat
-+$(LIBS): .libs
-+
-+.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat
- 	cd ../../Source; make
-+	touch $@
-+
-+$(CUDACPP_BUILDDIR)/.cudacpplibs:
-+	$(MAKE) -f $(CUDACPP_MAKEFILE)
-+	touch $@
-+
-+# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH
-+# Use relative paths with respect to the executables ($ORIGIN on Linux)
-+# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary
-+ifeq ($(UNAME_S),Darwin)
-+  override LIBFLAGSRPATH =
-+else ifeq ($(USEBUILDDIR),1)
-+  override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)'
-+else
-+  override LIBFLAGSRPATH = -Wl,-rpath,'$$ORIGIN/$(LIBDIR)'
-+endif
-+
-+.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link
-+
-+madevent_fortran_link: $(PROG)_fortran
-+	rm -f $(PROG)
-+	ln -s $(PROG)_fortran $(PROG)
-+
-+madevent_cuda_link:
-+	$(MAKE) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda
-+	rm -f $(PROG)
-+	ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG)
-+
-+madevent_hip_link:
-+	$(MAKE) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip
-+	rm -f $(PROG)
-+	ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG)
-+
-+madevent_cpp_link:
-+	$(MAKE) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp
-+	rm -f $(PROG)
-+	ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG)
- 
--$(LIBDIR)libpdf.$(libext): 
--	cd ../../Source/PDF; make
-+override SUPPORTED_AVXS = cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
-+madevent_%_link:
-+	@if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then echo "ERROR! Invalid target '$@' (supported madevent_cpp*_link targets are: $(foreach avx,$(SUPPORTED_AVXS),'madevent_cpp$(avx)_link'))"; exit 1; fi
-+	$(MAKE) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp
-+	rm -f $(PROG)
-+	ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG)
- 
--$(LIBDIR)libgammaUPC.$(libext):
--	cd ../../Source/PDF/gammaUPC; make
-+# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503)
-+$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs
-+	$(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS)
-+
-+# Building $(PROG)_cuda now uses its own rule
-+$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs
-+	$(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS)
-+
-+# Building $(PROG)_hip also uses its own rule
-+$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs
-+	$(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS)
-+
-+counters.o: counters.cc timer.h
-+	$(CXX) $(CXXFLAGS) -c $< -o $@
-+
-+ompnumthreads.o: ompnumthreads.cc ompnumthreads.h
-+	$(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@
-+
-+$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL)
-+	$(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS)
-+
-+gensym: $(SYMMETRY) configs.inc $(LIBS)
-+	$(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS)
-+
-+###ifeq (,$(wildcard fbridge.inc)) # Pointless: fbridge.inc always exists as this is the cudacpp-modified makefile!
-+###$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat
-+###	cd ../../Source/MODEL; make
-+###
-+###$(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat
-+###	cd ../../Source; make
-+###
-+###$(LIBDIR)libpdf.$(libext): 
-+###	cd ../../Source/PDF; make
-+###
-+###$(LIBDIR)libgammaUPC.$(libext):
-+###	cd ../../Source/PDF/gammaUPC; make
-+###endif
- 
- # Add source so that the compiler finds the DiscreteSampler module.
- $(MATRIX): %.o: %.f
- 	$(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC
- %.o: %.f
- 	$(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC
-+%_cudacpp.o: %.f
-+	$(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@
- 
- # Dependencies
- 
-@@ -97,5 +248,80 @@ unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \
- 	 run_config.inc
- initcluster.o: message.inc
- 
--clean:
--	$(RM) *.o gensym madevent madevent_forhel
-+# Extra dependencies on discretesampler.mod
-+
-+auto_dsig.o: .libs
-+driver.o: .libs
-+driver_cudacpp.o: .libs
-+$(MATRIX): .libs
-+genps.o: .libs
-+
-+# Cudacpp bldall targets
-+
-+ifeq ($(UNAME_P),ppc64le)
-+bldavxs: bldnone bldsse4
-+else ifeq ($(UNAME_P),arm)
-+bldavxs: bldnone bldsse4
-+else
-+bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z
-+endif
-+
-+ifneq ($(shell which hipcc 2>/dev/null),)
-+ifneq ($(shell which nvcc 2>/dev/null),)
-+bldall: bldhip bldcuda bldavxs
-+else
-+bldall: bldhip bldavxs
-+endif
-+else
-+ifneq ($(shell which nvcc 2>/dev/null),)
-+bldall: bldcuda bldavxs
-+else
-+bldall: bldavxs
-+endif
-+endif
-+
-+bldcuda: $(PROG)_fortran $(DSIG_cudacpp)
-+	@echo
-+	$(MAKE) USEBUILDDIR=1 BACKEND=cuda
-+
-+bldhip: $(PROG)_fortran $(DSIG_cudacpp)
-+	@echo
-+	$(MAKE) USEBUILDDIR=1 BACKEND=hip
-+
-+bldnone: $(PROG)_fortran $(DSIG_cudacpp)
-+	@echo
-+	$(MAKE) USEBUILDDIR=1 BACKEND=cppnone
-+
-+bldsse4: $(PROG)_fortran $(DSIG_cudacpp)
-+	@echo
-+	$(MAKE) USEBUILDDIR=1 BACKEND=cppsse4
-+
-+bldavx2: $(PROG)_fortran $(DSIG_cudacpp)
-+	@echo
-+	$(MAKE) USEBUILDDIR=1 BACKEND=cppavx2
-+
-+bld512y: $(PROG)_fortran $(DSIG_cudacpp)
-+	@echo
-+	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512y
-+
-+bld512z: $(PROG)_fortran $(DSIG_cudacpp)
-+	@echo
-+	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z
-+
-+# Clean (NB: 'make clean' in Source calls 'make clean' in all P*)
-+
-+clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn
-+	$(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(CUDACPP_BUILDDIR)/$(PROG)_hip
-+
-+cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src
-+	$(MAKE) -f $(CUDACPP_MAKEFILE) cleanall
-+	rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs
-+	rm -f .libs
-+
-+cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src
-+	make -C ../../Source cleanall
-+	rm -rf $(LIBDIR)libbias.$(libext)
-+	rm -f ../../Source/*.mod ../../Source/*/*.mod
-+
-+distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation
-+	$(MAKE) -f $(CUDACPP_MAKEFILE) distclean
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
index b1739da73d..421fb0f97e 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
@@ -40,7 +40,9 @@ def compile(self, *args, **opts):
             cudacpp_backend = self.run_card['cudacpp_backend'].lower() # the default value is defined in launch_plugin.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend in cudacpp_supported_backends :
-                args[0][0] = 'madevent_' + cudacpp_backend + '_link'
+                new_args = list(args)
+                new_args[0] = ['-f', 'makefile', '-f', 'cudacpp_overlay.mk', 'madevent_' + cudacpp_backend + '_link'] + new_args[0][1:]
+                args = tuple(new_args)
             else:
-                raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%supported_backends )
+                raise Exception( "Invalid cudacpp_backend='%s': supported backends are %s"%(cudacpp_backend, cudacpp_supported_backends) ) # two placeholders need a two-item tuple
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_overlay.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_overlay.mk
new file mode 100644
index 0000000000..0af10482d7
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_overlay.mk
@@ -0,0 +1,297 @@
+# cudacpp_overlay.mk
+# To be used after the project makefile
+# Usage: make -f makefile -f cudacpp_overlay.mk ...
+SHELL := /bin/bash
+
+# Recursive-make helper
+PRIMARY_MK ?= makefile
+OVERLAY_MK ?= cudacpp_overlay.mk
+SELF_MF := -f $(PRIMARY_MK) -f $(OVERLAY_MK)
+
+# Determine CUDACPP_BUILDDIR based on the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD and USEBUILDDIR (#829)
+# Stop with an error if BACKEND=cuda and nvcc is missing or if BACKEND=hip and hipcc is missing
+include ../../src/cudacpp_config.mk
+ifeq ($(CUDACPP_BUILDDIR),)
+  $(error CUDACPP_BUILDDIR='$(CUDACPP_BUILDDIR)' should not be empty!)
+endif
+
+# Basic uname helpers (if not already set)
+UNAME_S ?= $(shell uname -s)
+UNAME_P ?= $(shell uname -p)
+
+# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html
+FFLAGS+= -cpp
+
+# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740)
+CXXFLAGS = -O3 -Wall -Wshadow -Wextra
+
+# Add -std=c++17 explicitly to avoid build errors on macOS
+# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked"
+ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+	CXXFLAGS += -std=c++17 -mmacosx-version-min=11.3
+endif
+
+# Enable ccache for C++ if USECCACHE=1 (do not enable it for Fortran since it is not supported for Fortran)
+ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
+	override CXX := ccache $(CXX)
+endif
+
+# ----------------------------------------------------------------------
+# Backend library names and process id
+# ----------------------------------------------------------------------
+CUDACPP_MAKEFILE := cudacpp.mk
+processid_short  := $(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
+
+ifeq ($(BACKEND),cuda)
+	CUDACPP_COMMONLIB := mg5amc_common_cuda
+	CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cuda
+else ifeq ($(BACKEND),hip)
+	CUDACPP_COMMONLIB := mg5amc_common_hip
+	CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_hip
+else
+	CUDACPP_COMMONLIB := mg5amc_common_cpp
+	CUDACPP_BACKENDLIB := mg5amc_$(processid_short)_cpp
+endif
+
+# ----------------------------------------------------------------------
+# Libraries and link line adjustments
+# ----------------------------------------------------------------------
+# Prefer LIBDIR everywhere; base makefile already defines LIBDIR.
+LINKLIBS := $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) \
+            -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias
+
+# OpenMP: enable only if requested, USEOPENMP=1 (#758)
+ifeq ($(USEOPENMP),1)
+  ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
+    override OMPFLAGS = -fopenmp
+    LINKLIBS += -liomp5 # see #578
+    LINKLIBS += -lintlc # undefined reference to '_intel_fast_memcpy'
+  else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
+    override OMPFLAGS = -fopenmp
+    # For the *cpp* binary with clang, ensure libomp is found
+    $(CUDACPP_BUILDDIR)/$(PROG)_cpp: LINKLIBS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604
+  else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),)
+    override OMPFLAGS = # OMP is not supported yet by cudacpp for Apple clang
+  else
+    override OMPFLAGS = -fopenmp
+  endif
+endif
+
+# ----------------------------------------------------------------------
+# Objects & targets
+# ----------------------------------------------------------------------
+# Keep driver* separate from PROCESS; we form DSIG groups below.
+PROCESS := myamp.o genps.o unwgt.o setcuts.o get_color.o \
+           cuts.o cluster.o reweight.o initcluster.o addmothers.o setscales.o \
+           idenparts.o dummy_fct.o
+
+DSIG := driver.o $(patsubst %.f, %.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f)))
+DSIG_cudacpp := driver_cudacpp.o $(patsubst %.f, %_cudacpp.o, $(filter-out auto_dsig.f, $(wildcard auto_dsig*.f)))
+
+SYMMETRY := symmetry.o idenparts.o
+
+# Binaries (OS-specific linker flags; UNAME_S is defined above in this overlay)
+
+ifeq ($(UNAME_S),Darwin)
+  LDFLAGS += -lc++ -mmacosx-version-min=11.3
+else
+  LDFLAGS += -Wl,--no-relax
+endif
+
+# Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal)
+.DEFAULT_GOAL := all
+ifeq ($(BACKEND),cuda)
+  all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cuda
+else ifeq ($(BACKEND),hip)
+  all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_hip
+else
+  all: $(PROG)_fortran $(CUDACPP_BUILDDIR)/$(PROG)_cpp
+endif
+
+# Library build stamps
+$(LIBS): .libs
+
+.libs: ../../Cards/param_card.dat ../../Cards/run_card.dat
+	$(MAKE) -C ../../Source
+	touch $@
+
+$(CUDACPP_BUILDDIR)/.cudacpplibs:
+	$(MAKE) -f $(CUDACPP_MAKEFILE)
+	touch $@
+
+# Neutralise the per-library recipes from the primary makefile: otherwise duplicate
+# sub-makes under ../../Source may run in parallel and race against each other.
+# Build the libs only via the single .libs stamp.
+
+# Ensure these targets are satisfied by building Source once
+$(LIBDIR)libmodel.$(libext)     : | .libs
+$(LIBDIR)libgeneric.$(libext)   : | .libs
+$(LIBDIR)libpdf.$(libext)       : | .libs
+$(LIBDIR)libgammaUPC.$(libext)  : | .libs
+
+# Override the recipes from the primary makefile with no-op recipes
+# (GNU Make uses the last recipe it reads for a target.)
+$(LIBDIR)libmodel.$(libext)     : ; @:
+$(LIBDIR)libgeneric.$(libext)   : ; @:
+$(LIBDIR)libpdf.$(libext)       : ; @:
+$(LIBDIR)libgammaUPC.$(libext)  : ; @:
+
+# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH
+# Use relative paths with respect to the executables ($ORIGIN on Linux)
+# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary
+ifeq ($(UNAME_S),Darwin)
+  override LIBFLAGSRPATH :=
+else ifeq ($(USEBUILDDIR),1)
+  override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/../$(LIBDIR)/$(CUDACPP_BUILDDIR)'
+else
+  override LIBFLAGSRPATH := -Wl,-rpath,'$$ORIGIN/$(LIBDIR)'
+endif
+
+# Final link steps
+$(PROG)_fortran: $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o
+	$(FC) -o $@ $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS)
+
+# Building $(PROG)_cpp no longer builds $(PROG)_cuda if CUDACPP_BACKENDLIB for cuda exists (this was the case in the past to allow cpp-only builds #503)
+$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs
+	$(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS)
+
+# Building $(PROG)_cuda now uses its own rule
+$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs
+	$(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS)
+
+# Building $(PROG)_hip also uses its own rule
+$(CUDACPP_BUILDDIR)/$(PROG)_hip: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs
+	$(FC) -o $@ $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_BACKENDLIB) $(LIBFLAGSRPATH) $(LDFLAGS)
+
+# Helpers compiled with C++
+counters.o: counters.cc timer.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ompnumthreads.o: ompnumthreads.cc ompnumthreads.h
+	$(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@
+
+# Alternate binaries (kept for parity)
+$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL)
+	$(FC) -o $@ $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS)
+
+gensym: $(SYMMETRY) configs.inc $(LIBS)
+	$(FC) -o $@ $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS)
+
+# Compile rules (override base ones)
+$(MATRIX): %.o: %.f
+	$(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC
+
+%.o: %.f
+	$(FC) $(FFLAGS) -c $< -I../../Source/ -I../../Source/PDF/gammaUPC
+
+%_cudacpp.o: %.f
+	$(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@
+
+# Extra dependencies on discretesampler.mod
+auto_dsig.o: .libs
+driver.o: .libs
+driver_cudacpp.o: .libs
+$(MATRIX): .libs
+genps.o: .libs
+
+# Convenience link targets to switch $(PROG) symlink
+.PHONY: madevent_fortran_link madevent_cuda_link madevent_hip_link madevent_cpp_link
+madevent_fortran_link: $(PROG)_fortran
+	rm -f $(PROG)
+	ln -s $(PROG)_fortran $(PROG)
+
+madevent_cuda_link:
+	$(MAKE) $(SELF_MF) USEGTEST=0 BACKEND=cuda $(CUDACPP_BUILDDIR)/$(PROG)_cuda
+	rm -f $(PROG)
+	ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG)
+
+madevent_hip_link:
+	$(MAKE) $(SELF_MF) USEGTEST=0 BACKEND=hip $(CUDACPP_BUILDDIR)/$(PROG)_hip
+	rm -f $(PROG)
+	ln -s $(CUDACPP_BUILDDIR)/$(PROG)_hip $(PROG)
+
+madevent_cpp_link:
+	$(MAKE) $(SELF_MF) USEGTEST=0 BACKEND=cppauto $(CUDACPP_BUILDDIR)/$(PROG)_cpp
+	rm -f $(PROG)
+	ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG)
+
+# Variant AVX builds for cpp backend
+override SUPPORTED_AVXS := cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+madevent_%_link:
+	@if [ '$(words $(filter $*, $(SUPPORTED_AVXS)))' != '1' ]; then \
+	  echo "ERROR! Invalid target '$@' (supported: $(foreach avx,$(SUPPORTED_AVXS),madevent_$(avx)_link))"; exit 1; fi
+	$(MAKE) $(SELF_MF) USEGTEST=0 BACKEND=$* $(CUDACPP_BUILDDIR)/$(PROG)_cpp
+	rm -f $(PROG)
+	ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROG)
+
+# Cudacpp bldall targets
+ifeq ($(UNAME_P),ppc64le)
+  bldavxs: bldnone bldsse4
+else ifeq ($(UNAME_P),arm)
+  bldavxs: bldnone bldsse4
+else
+  bldavxs: bldnone bldsse4 bldavx2 bld512y bld512z
+endif
+
+ifneq ($(shell which hipcc 2>/dev/null),)
+  ifneq ($(shell which nvcc 2>/dev/null),)
+    bldall: bldhip bldcuda bldavxs
+  else
+    bldall: bldhip bldavxs
+  endif
+else
+  ifneq ($(shell which nvcc 2>/dev/null),)
+    bldall: bldcuda bldavxs
+  else
+    bldall: bldavxs
+  endif
+endif
+
+bldcuda: $(PROG)_fortran $(DSIG_cudacpp)
+	@echo
+	$(MAKE) $(SELF_MF) USEBUILDDIR=1 BACKEND=cuda
+
+bldhip: $(PROG)_fortran $(DSIG_cudacpp)
+	@echo
+	$(MAKE) $(SELF_MF) USEBUILDDIR=1 BACKEND=hip
+
+bldnone: $(PROG)_fortran $(DSIG_cudacpp)
+	@echo
+	$(MAKE) $(SELF_MF) USEBUILDDIR=1 BACKEND=cppnone
+
+bldsse4: $(PROG)_fortran $(DSIG_cudacpp)
+	@echo
+	$(MAKE) $(SELF_MF) USEBUILDDIR=1 BACKEND=cppsse4
+
+bldavx2: $(PROG)_fortran $(DSIG_cudacpp)
+	@echo
+	$(MAKE) $(SELF_MF) USEBUILDDIR=1 BACKEND=cppavx2
+
+bld512y: $(PROG)_fortran $(DSIG_cudacpp)
+	@echo
+	$(MAKE) $(SELF_MF) USEBUILDDIR=1 BACKEND=cpp512y
+
+bld512z: $(PROG)_fortran $(DSIG_cudacpp)
+	@echo
+	$(MAKE) $(SELF_MF) USEBUILDDIR=1 BACKEND=cpp512z
+
+# Clean (NB: 'make clean' in Source calls 'make clean' in all P*)
+clean: # Clean builds: fortran in this Pn; cudacpp executables for one AVX in this Pn
+	$(RM) *.o gensym $(PROG) $(PROG)_fortran $(PROG)_forhel \
+	       $(CUDACPP_BUILDDIR)/$(PROG)_cpp \
+	       $(CUDACPP_BUILDDIR)/$(PROG)_cuda \
+	       $(CUDACPP_BUILDDIR)/$(PROG)_hip
+
+cleanavxs: clean # Clean builds: fortran in this Pn; cudacpp for all AVX in this Pn and in src
+	$(MAKE) -f $(CUDACPP_MAKEFILE) cleanall
+	rm -f $(CUDACPP_BUILDDIR)/.cudacpplibs
+	rm -f .libs
+
+cleanall: # Clean builds: fortran in all P* and in Source; cudacpp for all AVX in all P* and in src
+	$(MAKE) -C ../../Source cleanall
+	rm -rf $(LIBDIR)libbias.$(libext)
+	rm -f ../../Source/*.mod ../../Source/*/*.mod
+
+distclean: cleanall # Clean all fortran and cudacpp builds as well as the googletest installation
+	$(MAKE) -f $(CUDACPP_MAKEFILE) distclean
+
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
index 209f088314..839ab5c62e 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
@@ -121,6 +121,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU):
                                       s+'gpu/MadgraphTest.h', s+'gpu/runTest.cc',
                                       s+'gpu/testmisc.cc', s+'gpu/testxxx_cc_ref.txt', s+'gpu/valgrind.h',
                                       s+'gpu/perf.py', s+'gpu/profile.sh',
+                                      s+'gpu/cudacpp_overlay.mk',
                                       s+'CMake/SubProcesses/CMakeLists.txt'],
                      'test': [s+'gpu/cudacpp_test.mk']}
 
@@ -144,6 +145,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU):
                     'MadgraphTest.h', 'runTest.cc',
                     'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h',
                     'cudacpp.mk', # this is generated from a template in Subprocesses but we still link it in P1
+                    'cudacpp_overlay.mk', # this is generated from a template in Subprocesses but we still link it in P1
                     'testxxx.cc', # this is generated from a template in Subprocesses but we still link it in P1
                     'MemoryBuffers.h', # this is generated from a template in Subprocesses but we still link it in P1
                     'MemoryAccessCouplings.h', # this is generated from a template in Subprocesses but we still link it in P1
@@ -237,8 +239,8 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag):
             outputflags is a list of options provided when doing the output command"""
         ###misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self))
         if self.in_madevent_mode:
-            if 'CUDACPP_CODEGEN_PATCHLEVEL' in os.environ: patchlevel = os.environ['CUDACPP_CODEGEN_PATCHLEVEL']
-            else: patchlevel = ''
+            # if 'CUDACPP_CODEGEN_PATCHLEVEL' in os.environ: patchlevel = os.environ['CUDACPP_CODEGEN_PATCHLEVEL']
+            # else: patchlevel = ''
             # OLDEST implementation (AV)
             #path = os.path.realpath(os.curdir + os.sep + 'PLUGIN' + os.sep + 'CUDACPP_OUTPUT')
             #misc.sprint(path)
@@ -255,17 +257,46 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag):
             # **NB** AV: change the Popen call to always dump stdout and stderr, because I want to always see the output
             # **NB** AV: this also allows error checking by looking for error strings on the generation log if patchMad.sh silently fails
             # **NB** AV: (e.g. this did happen in the past, when patchMad.sh was calling 'madevent treatcards run', and the latter silently failed)
-            plugin_path = os.path.dirname(os.path.realpath( __file__ ))
+            # plugin_path = os.path.dirname(os.path.realpath( __file__ ))
             ###p = subprocess.Popen([pjoin(plugin_path, 'patchMad.sh'), self.dir_path , 'PROD', str(patchlevel)],
             ###                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            p = subprocess.Popen([pjoin(plugin_path, 'patchMad.sh'), self.dir_path , 'PROD', str(patchlevel)]) # AV always dump patchMad.sh stdout/stderr
-            stdout, stderr = p.communicate()
-            misc.sprint(p.returncode)
-            if p.returncode != 0: # AV: WARNING! do not fully trust this check! patchMad.sh was observed to silently fail in the past...
-                logger.debug("####### \n stdout is \n %s", stdout)
-                logger.info("####### \n stderr is \n %s", stderr)
-                logger.info("return code is %s\n", p.returncode)
-                raise Exception('ERROR! the O/S call to patchMad.sh failed')
+            # p = subprocess.Popen([pjoin(plugin_path, 'patchMad.sh'), self.dir_path , 'PROD', str(patchlevel)]) # AV always dump patchMad.sh stdout/stderr
+            # stdout, stderr = p.communicate()
+            # misc.sprint(p.returncode)
+            # if p.returncode != 0: # AV: WARNING! do not fully trust this check! patchMad.sh was observed to silently fail in the past...
+            #     logger.debug("####### \n stdout is \n %s", stdout)
+            #     logger.info("####### \n stderr is \n %s", stderr)
+            #     logger.info("return code is %s\n", p.returncode)
+            #     raise Exception('ERROR! the O/S call to patchMad.sh failed')
+
+            patch_coupl_write = r"""set -euo pipefail
+# Get last fields from lines starting with WRITE(*,2)
+gcs=$(awk '$1=="WRITE(*,2)" {print $NF}' coupl_write.inc)
+
+for gc in $gcs; do
+  if grep -q "$gc(VECSIZE_MEMMAX)" coupl.inc; then
+    awk -v gc="$gc" '{
+      if ($1=="WRITE(*,2)" && $NF==gc) print $0"(1)";
+      else print
+    }' coupl_write.inc > coupl_write.inc.new
+    mv coupl_write.inc.new coupl_write.inc
+  fi
+done"""
+            try:
+                result = subprocess.run(
+                    ["bash", "-c", patch_coupl_write],  # non-login shell: do not source the user's profile scripts
+                    cwd=pjoin(self.dir_path, "Source", "MODEL"),
+                    text=True,
+                    capture_output=True,
+                    check=True,  # raise CalledProcessError on non-zero exit
+                )
+                misc.sprint(result.returncode)
+            except subprocess.CalledProcessError as e:
+                logger.debug("####### \n stdout is \n %s", e.stdout)
+                logger.info("####### \n stderr is \n %s", e.stderr)
+                logger.info("return code is %s\n", e.returncode)
+                raise Exception("ERROR while patching coupl_write.inc") from e
+
             # Additional patching (OM)
             self.add_madevent_plugin_fct() # Added by OM
         # do not call standard finalize since is this is already done...
@@ -332,6 +363,7 @@ def change_output_args(args, cmd):
         return args
 
 class FortranExporterBridge(export_v4.ProcessExporterFortranMEGroup):
+    _file_path = export_v4._file_path
 
     def write_auto_dsig_file(self, writer, matrix_element, proc_id = ""):
         replace_dict,context = super().write_auto_dsig_file(False, matrix_element, proc_id)
@@ -370,15 +402,121 @@ def write_auto_dsig_file(self, writer, matrix_element, proc_id = ""):
 #endif
 CALL COUNTERS_SMATRIX1MULTI_START( -1, VECSIZE_USED )  ! fortranMEs=-1"""
         replace_dict["OMP_POSTFIX"] = open(pjoin(PLUGINDIR,'madgraph','iolibs','template_files','gpu','smatrix_multi.f')).read().split('\n',4)[4] # AV skip 4 copyright lines
-        _file_path = export_v4._file_path
         if writer:
-            file = open(pjoin(_file_path, 'iolibs/template_files/auto_dsig_v4.inc')).read()
+            file = open(pjoin(self._file_path, 'iolibs/template_files/auto_dsig_v4.inc')).read()
             file = file % replace_dict
             # Write the file
             writer.writelines(file, context=context)
         else:
             return replace_dict, context
 
+    def write_driver(self, writer, *args, **kwargs):
+        """Write SubProcesses/driver.f with the CUDACPP additions (return True); if writer is falsy, return the replacement dict instead"""
+        replace_dict = super().write_driver(False, *args, **kwargs)
+
+        # Additions from CUDACPP plugin (after patch)
+        replace_dict['CUDACPP_EXTRA_HEADER'] = """
+      character*255 env_name, env_value
+      integer env_length, env_status
+
+#ifdef MG5AMC_MEEXPORTER_CUDACPP
+      INCLUDE 'fbridge.inc'
+c     INCLUDE 'fbridge_common.inc'
+#endif
+      INCLUDE 'fbridge_common.inc'
+"""
+
+        replace_dict['CUDACPP_EXTRA_INITIALISE'] = """
+#ifdef _OPENMP
+      CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD()
+#endif
+      CALL COUNTERS_INITIALISE()
+
+#ifdef MG5AMC_MEEXPORTER_CUDACPP
+      fbridge_mode = 1 ! CppOnly=1, default for CUDACPP
+#else
+      fbridge_mode = 0 ! FortranOnly=0, default for FORTRAN
+#endif
+      env_name = 'CUDACPP_RUNTIME_FBRIDGEMODE'
+      call get_environment_variable(env_name, env_value, env_length, env_status)
+      if( env_status.eq.0 ) then
+        write(*,*) 'Found environment variable "', trim(env_name), '" with value "', trim(env_value), '"'
+        read(env_value,'(I255)') FBRIDGE_MODE ! see https://gcc.gnu.org/onlinedocs/gfortran/ICHAR.html
+        write(*,*) 'FBRIDGE_MODE (from env) = ', FBRIDGE_MODE
+      else if( env_status.eq.1 ) then ! 1 = not defined
+        write(*,*) 'FBRIDGE_MODE (default) = ', FBRIDGE_MODE
+      else ! -1 = too long for env_value, 2 = not supported by O/S
+        write(*,*) 'ERROR! get_environment_variable failed for "', trim(env_name), '"'
+        STOP
+      endif
+#ifndef MG5AMC_MEEXPORTER_CUDACPP
+      if( fbridge_mode.ne.0 ) then
+        write(*,*) 'ERROR! Invalid fbridge_mode (in FORTRAN backend mode) = ', fbridge_mode
+        STOP
+      endif
+#endif
+
+      vecsize_used = vecsize_memmax ! default ! CppOnly=1, default for CUDACPP
+      env_name = 'CUDACPP_RUNTIME_VECSIZEUSED'
+      call get_environment_variable(env_name, env_value, env_length, env_status)
+      if( env_status.eq.0 ) then
+        write(*,*) 'Found environment variable "', trim(env_name), '" with value "', trim(env_value), '"'
+        read(env_value,'(I255)') VECSIZE_USED ! see https://gcc.gnu.org/onlinedocs/gfortran/ICHAR.html
+        write(*,*) 'VECSIZE_USED (from env) = ', VECSIZE_USED
+      else if( env_status.eq.1 ) then ! 1 = not defined
+        write(*,*) 'VECSIZE_USED (default) = ', VECSIZE_USED
+      else ! -1 = too long for env_value, 2 = not supported by O/S
+        write(*,*) 'ERROR! get_environment_variable failed for "', trim(env_name), '"'
+        STOP
+      endif
+      if( VECSIZE_USED.gt.VECSIZE_MEMMAX .or. VECSIZE_USED.le.0 ) then
+        write(*,*) 'ERROR! Invalid VECSIZE_USED = ', VECSIZE_USED
+        STOP
+      endif
+
+#ifdef MG5AMC_MEEXPORTER_CUDACPP
+      CALL FBRIDGECREATE(FBRIDGE_PBRIDGE, VECSIZE_USED, NEXTERNAL, 4) ! this must be at the beginning as it initialises the CUDA device
+      FBRIDGE_NCBYF1 = 0
+      FBRIDGE_CBYF1SUM = 0
+      FBRIDGE_CBYF1SUM2 = 0
+      FBRIDGE_CBYF1MAX = -1D100
+      FBRIDGE_CBYF1MIN = 1D100
+#endif
+"""
+
+        replace_dict['CUDACPP_EXTRA_FINALISE'] = """
+#ifdef MG5AMC_MEEXPORTER_CUDACPP
+      CALL FBRIDGEDELETE(FBRIDGE_PBRIDGE) ! this must be at the end as it shuts down the CUDA device
+      IF( FBRIDGE_MODE .LE. -1 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
+        WRITE(*,'(a,f10.8,a,e8.2)')
+     &    ' [MERATIOS] ME ratio CudaCpp/Fortran: MIN = ',
+     &    FBRIDGE_CBYF1MIN + 1, ' = 1 - ', -FBRIDGE_CBYF1MIN
+        WRITE(*,'(a,f10.8,a,e8.2)')
+     &    ' [MERATIOS] ME ratio CudaCpp/Fortran: MAX = ',
+     &    FBRIDGE_CBYF1MAX + 1, ' = 1 + ', FBRIDGE_CBYF1MAX
+        WRITE(*,'(a,i6)')
+     &    ' [MERATIOS] ME ratio CudaCpp/Fortran: NENTRIES = ',
+     &    FBRIDGE_NCBYF1
+c        WRITE(*,'(a,e8.2)')
+c    &    ' [MERATIOS] ME ratio CudaCpp/Fortran - 1: AVG = ',
+c    &    FBRIDGE_CBYF1SUM / FBRIDGE_NCBYF1
+c       WRITE(*,'(a,e8.2)')
+c    &    ' [MERATIOS] ME ratio CudaCpp/Fortran - 1: STD = ',
+c    &    SQRT( FBRIDGE_CBYF1SUM2 / FBRIDGE_NCBYF1 ) ! ~standard deviation
+        WRITE(*,'(a,e8.2,a,e8.2)')
+     &    ' [MERATIOS] ME ratio CudaCpp/Fortran - 1: AVG = ',
+     &    FBRIDGE_CBYF1SUM / FBRIDGE_NCBYF1, ' +- ',
+     &    SQRT( FBRIDGE_CBYF1SUM2 ) / FBRIDGE_NCBYF1 ! ~standard error
+      ENDIF
+#endif
+      CALL COUNTERS_FINALISE()
+"""
+
+        if not writer:
+            return replace_dict  # no writer: hand back the substitutions instead of writing the file
+        with open(pjoin(self._file_path,'iolibs','template_files','madevent_driver.f')) as template_file:
+            writer.write(template_file.read() % replace_dict)  # the template is closed on exit, even on error
+        return True
 #------------------------------------------------------------------------------------
 
 class GPU_ProcessExporter(PLUGIN_ProcessExporter_MadEvent):
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/patchMad.sh b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/patchMad.sh
deleted file mode 100755
index 6122cee227..0000000000
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/patchMad.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2020-2024 CERN and UCLouvain.
-# Licensed under the GNU Lesser General Public License (version 3 or later).
-# Created by: A. Valassi (Mar 2022) for the MG5aMC CUDACPP plugin.
-# Further modified by: O. Mattelaer, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
-
-set -e # immediate exit on error
-
-status=0
-
-scrdir=$(cd $(dirname $0); pwd)
-
-function usage()
-{
-  echo "ERROR! Unknown command '$0 $*'"
-  echo "Usage: $0 <process_dir> <patch_dir> [--nopatch|--upstream]"
-  exit 1 
-}
-
-# Patch level
-###patchlevel=0 # [--upstream] out of the box codegen from upstream MG5AMC (do not even copy templates)
-###patchlevel=1 # [--nopatch] modify upstream MG5AMC but do not apply patch commands (reference to prepare new patches)
-patchlevel=2 # [DEFAULT] complete generation of cudacpp .sa/.mad (copy templates and apply patch commands)
-
-if [ "$2" == "" ]; then
-  usage $*
-elif [ "$3" == "--nopatch" ]; then
-  if [ "$4" != "" ]; then usage; fi
-  patchlevel=1
-elif [ "$3" == "--upstream" ]; then
-  if [ "$4" != "" ]; then usage; fi
-  patchlevel=0
-elif [ "$3" != "" ]; then
-  usage $*
-fi
-dir=$1
-dir_patches=$2
-###echo "Current dir: $pwd"
-###echo "Input dir to patch: $dir"
-
-if [ ! -e ${dir} ]; then echo "ERROR! Directory $dir does not exist"; exit 1; fi
-
-# Exit here for patchlevel 0 (--upstream)
-if [ "${patchlevel}" == "0" ]; then exit $status; fi
-
-# Patch the default Fortran code to provide the integration with the cudacpp plugin
-# (1) Process-independent patches
-touch ${dir}/Events/.keep # this file should already be present (mg5amcnlo copies it from Template/LO/Events/.keep) 
-#\cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/fbridge_common.inc ${dir}/SubProcesses # new file
-if [ "${patchlevel}" == "2" ]; then
-  cd ${dir}
-  echo "DEBUG: cd ${PWD}; patch -p4 -i ${scrdir}/MG5aMC_patches/${dir_patches}/patch.common"
-  if ! patch -p4 -i ${scrdir}/MG5aMC_patches/${dir_patches}/patch.common; then status=1; fi
-  \rm -f Source/*.orig
-  \rm -f bin/internal/*.orig
-  cd - > /dev/null
-fi
-for p1dir in ${dir}/SubProcesses/P*; do
-  cd $p1dir
-  #ln -sf ../fbridge_common.inc . # new file
-  #cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/counters.cc . # new file
-  #cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/ompnumthreads.cc . # new file
-  if [ "${patchlevel}" == "2" ]; then
-    echo "DEBUG: cd ${PWD}; patch -p6 -i ${scrdir}/MG5aMC_patches/${dir_patches}/patch.P1"
-    if ! patch -p6 -i ${scrdir}/MG5aMC_patches/${dir_patches}/patch.P1; then status=1; fi
-  fi
-  \rm -f *.orig
-  cd - > /dev/null
-done
-
-# Patch the default Fortran code to provide the integration with the cudacpp plugin
-# (2) Process-dependent patches
-cd ${dir}/Source/MODEL > /dev/null
-gcs=$(cat coupl_write.inc | awk '{if($1=="WRITE(*,2)") print $NF}') # different printouts for scalar/vector couplings #456
-for gc in $gcs; do
-  if grep -q "$gc(VECSIZE_MEMMAX)" coupl.inc; then
-    ###echo "DEBUG: Coupling $gc is a vector"
-    cat coupl_write.inc | awk -vgc=$gc '{if($1=="WRITE(*,2)" && $NF==gc) print $0"(1)"; else print $0}' > coupl_write.inc.new
-    \mv coupl_write.inc.new coupl_write.inc
-  ###else
-  ###  echo "DEBUG: Coupling $gc is a scalar"
-  fi
-done
-cd - > /dev/null
-
-# Patch the default cudacpp code to fix a bug in coloramps
-# ** NEW AUG 2023: DISABLING THE COLORAMPS PATCH FIXES THE LHE COLOR MISMATCH IN GG_TTGG (#655 and #713) **
-# (3) Process-dependent patches
-#for p1dir in ${dir}/SubProcesses/P*; do
-#  cd $p1dir
-#  cat coloramps.h | awk -vp=1 '{if (p==1) print $0; if ($1=="__device__") p=0}' > coloramps.h.new
-#  cat coloramps.inc | sed 's|)/|)/ {|' | sed 's|/$|}, /|' \
-#    | awk -vb= '{if($1~")/"){b=$2}; if($1=="$"){b=b$2}; if($3=="/"){print "    "b}}' \
-#    | sed 's/.TRUE./ true/g' | sed 's/.FALSE./ false/g' | sed 's/}/ }/' >> coloramps.h.new
-#  truncate -s -2 coloramps.h.new
-#  echo "">> coloramps.h.new
-#  cat coloramps.h | awk -vp=0 '{if ($1=="};") p=1; if (p==1) print $0}' >> coloramps.h.new
-#  \mv coloramps.h.new coloramps.h
-#done
-
-exit $status