diff --git a/.github/workflows/a100_profiler.yml b/.github/workflows/a100_profiler.yml new file mode 100644 index 0000000000..c95d4bfaaa --- /dev/null +++ b/.github/workflows/a100_profiler.yml @@ -0,0 +1,38 @@ +name: A100 Performance Profiler + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + sycl_A100_Profiling: + name: SYCL A100 Profiling + env: + SYCL_NAME_PREFIX: sycl_AMD-Epyc-7313_a100_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, a100] + steps: + - uses: actions/checkout@v2 + - name: Runs SYCL performanceProfiler.py script + run: cd tools/profiling/; + python3 performanceProfiler.py -l 'SYCL' -b 'master' + - name: Uploads SYCL JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master + + cuda_a100_Profiling: + name: CUDA A100 Profiling + env: + CUDA_NAME_PREFIX: cudacpp_AMD-Epyc-7313_a100_gcc-11.2.1_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, a100] + steps: + - uses: actions/checkout@v2 + - name: Runs CUDA performanceProfiler.py script + run: cd tools/profiling/; + python3 performanceProfiler.py -l 'CUDA' -b 'master' + - name: Uploads CUDA JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index ec671246bd..b051702c17 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -11,9 +11,14 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum , epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum ] + folder: [ epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, + epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx, + epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg, + epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg, + epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ] fail-fast: false steps: + - uses: actions/checkout@v2 - name: make debug run: make -C ${{ matrix.folder }} debug @@ -50,11 +55,11 @@ jobs: - name: make check run: make AVX=none OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk check GPU: - runs-on: self-hosted + runs-on: [self-hosted, linux, a100] env: CUDA_HOME: /usr/local/cuda/ - FC: gfortran REQUIRE_CUDA: 1 + FC: gfortran strategy: matrix: folder: [ epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum , epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg ] @@ -62,8 +67,6 @@ jobs: fail-fast: false steps: - uses: actions/checkout@v2 - - name: path - run: echo "PATH=$PATH" - name: make info run: make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info - name: make diff --git a/.github/workflows/mi250x_profiler.yml b/.github/workflows/mi250x_profiler.yml new file mode 100644 index 0000000000..2a408e203c --- /dev/null +++ b/.github/workflows/mi250x_profiler.yml @@ -0,0 +1,80 @@ +name: MI250X Performance Profiler + +on: + push: + branches: [ gpu_abstraction ] + +jobs: + Container_Setup_and_Execution: + runs-on: [self-hosted, linux, a100] + name: Container Setup and Execution + steps: + - name: Generate runner token + id: generate_token + run: | + TOKEN=$(curl -XPOST -fsSL \ + -H "Authorization: token ${{ secrets.PAT }}" \ + -H 
"Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \ + | grep -o '"token": *"[^"]*"' | cut -d '"' -f 4) + echo "token=$TOKEN" >> $GITHUB_OUTPUT + - name: SSH and run Docker container + env: + SSH_PRIVATE_KEY: ${{ secrets.SSH_KEY }} + MI250X_PROFILING_HOST: ${{ secrets.MI250X_PROFILING_HOST }} + MI250X_PROFILING_USER: ${{ secrets.MI250X_PROFILING_USER }} + HPC_ACCOUNT: ${{ secrets.HPC_ACCOUNT }} + HPC_PROJECT: ${{ secrets.HPC_PROJECT }} + SINGULARITY_CACHEDIR: /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ + SINGULARITY_TMPDIR: /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ + continue-on-error: true + run: | + echo "$SSH_PRIVATE_KEY" > id_rsa + chmod 600 id_rsa + ssh -o StrictHostKeyChecking=no -i id_rsa $MI250X_PROFILING_USER@$MI250X_PROFILING_HOST "\ + cd /scratch/$HPC_ACCOUNT/$MI250X_PROFILING_USER/ && \ + singularity pull --force oras://ghcr.io/${{ github.repository_owner }}/github_runner_mi250x:latest && \ + srun --account=$HPC_ACCOUNT -p $HPC_PROJECT --gpus=1 --time=03:00:00 singularity run --rocm \ + --env GITHUB_TOKEN=${{ steps.generate_token.outputs.token }} \ + --env REPO_URL=https://github.com/${{ github.repository }} \ + --env RUNNER_NAME=github_runner_mi250x \ + --env GITHUB_RUNNER_TAGS='Linux,x64,mi250x' \ + --env RUNNER_URL=https://github.com/actions/runner/releases/download/v2.303.0/actions-runner-linux-x64-2.303.0.tar.gz \ + github_runner_mi250x_latest.sif" + + HIP_MI250X_Profiling: + runs-on: [self-hosted, linux, mi250x] + name: HIP MI250X Profiling + env: + HIP_NAME_PREFIX: hip_AMD-Epyc-7A53_MI250X_gcc-11.2.1_rocm-5.2.3 + ENABLE_CI_PROFILER: 1 + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + steps: + - uses: actions/checkout@v2 + - name: Runs HIP performanceProfiler.py script + run: cd tools/profiling/; + python3 performanceProfiler.py -l 'HIP' -b 'master' + + - name: Uploads workplace_mg4gpu directory as an artifact + uses: actions/upload-artifact@v3 + with: + name: profiling-results + path: tools/profiling/workplace_mg4gpu + + Upload_JSON_files: + needs: HIP_MI250X_Profiling + runs-on: [self-hosted, linux] + name: Upload JSON files to DB + env: + HIP_NAME_PREFIX: hip_AMD-Epyc-7A53_MI250X_gcc-11.2.1_rocm-5.2.3 + ENABLE_CI_PROFILER: 1 + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + steps: + - uses: actions/checkout@v2 + - name: Download artifact containing profiling data + uses: actions/download-artifact@v3 + with: + name: profiling-results + path: tools/profiling + - name: Uploads HIP JSON files to DB + run: cd tools/profiling; python3 sendData.py --absLayer HIP --profiler 1 --branch master \ No newline at end of file diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml new file mode 100644 index 0000000000..3af91957f8 --- /dev/null +++ b/.github/workflows/sycl.yml @@ -0,0 +1,49 @@ +name: SYCL CI + +on: + push: + branches: [ master ] + paths: + - 'epochX/sycl/**' + pull_request: + branches: [ master ] + paths: + - 'epochX/sycl/**' + +jobs: + GPU: + runs-on: [self-hosted, linux, a100] + env: + FC: gfortran + REQUIRE_CUDA: 1 + SYCLFLAGS: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -Xclang -fdenormal-fp-math=ieee + ENABLE_CI_PROFILER: 1 + strategy: + matrix: + folder: [ epochX/sycl/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum, + epochX/sycl/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx, + epochX/sycl/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg, + epochX/sycl/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg, 
+ epochX/sycl/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg ] + precision: [ d , f ] + fail-fast: false + steps: + - uses: actions/checkout@v2 + - name: make info + run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; + CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++; + LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH; + make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} info + - name: make + run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; + CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++; + LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH; + make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} + - name: make check + run: source /cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/setup.sh; + source /cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/setvars.sh --include-intel-llvm; + CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++; + LD_LIBRARY_PATH=${{ github.workspace }}/${{ matrix.folder }}/../../lib:$LD_LIBRARY_PATH; + make FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} check \ No newline at end of file diff --git a/.github/workflows/v100s_profiler.yml b/.github/workflows/v100s_profiler.yml new file mode 100644 index 0000000000..a1cc4e710a --- /dev/null +++ b/.github/workflows/v100s_profiler.yml @@ -0,0 +1,39 @@ +name: V100s Performance Profiler + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + + sycl_v100s_Profiling: + name: SYCL V100S Profiling + env: + SYCL_NAME_PREFIX: sycl_Xeon-Silver-4216_v100s_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, v100s] + steps: + - uses: actions/checkout@v2 + - name: Runs SYCL performanceProfiler.py script + run: cd tools/profiling/; + python3 performanceProfiler.py -l 'SYCL' -b 'master' + - name: Uploads SYCL JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer SYCL --profiler 1 --branch master + + cuda_v100s_Profiling: + name: CUDA V100S Profiling + env: + CUDA_NAME_PREFIX: cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-12.0.1 + ENABLE_CI_PROFILER: 1 + + MADGRAPH4GPU_DB_SECRET: ${{ secrets.MADGRAPH4GPU_DB_SECRET }} + runs-on: [self-hosted, linux, v100s] + steps: + - uses: actions/checkout@v2 + - name: Runs CUDA performanceProfiler.py script + run: cd tools/profiling/; + python3 performanceProfiler.py -l 'CUDA' -b 'master' + - name: Uploads CUDA JSON files to DB + run: cd tools/profiling/; python3 sendData.py --absLayer CUDA --profiler 1 --branch master \ No newline at end of file diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS index 8541e954b9..71519d1ad8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS @@ -10,6 +10,7 @@ generates includes the following authors: Stephan Hageboeck (CERN) Olivier Mattelaer 
(Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) @@ -28,5 +29,4 @@ acknowledged collaboration with the following collaborators: Taran Singhania (PES University Bangalore) David Smith (CERN) Carl Vuosalo (University of Wisconsin-Madison) - Joergen Teig (CERN) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 3c231bdbd6..54ce4c64cf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -15,7 +15,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 0250c160ed..94b8dd6444 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -25,7 +25,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +85,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -155,7 +155,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -172,7 +172,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -196,7 +196,7 @@ namespace mg5amcCpu %(eftspecial2)s return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
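Editorial note on the recurring hunks above (and throughout this diff): they replace the nvcc-specific __CUDACC__ guard with the backend-neutral MGONGPUCPP_GPUIMPL. hipcc does not define __CUDACC__, so a HIP build of the old code would silently fall through to the CPU branch and the mg5amcCpu namespace. A minimal sketch of the idiom, assuming MGONGPUCPP_GPUIMPL is defined once per GPU build in a shared config header (that header is not part of this diff, so the #if below is illustrative only):

    // Assumed definition, e.g. in a common config header (not shown in this diff):
    #if defined __CUDACC__ || defined __HIPCC__
    #define MGONGPUCPP_GPUIMPL 1
    #endif

    // The guard then selects the namespace once, for any GPU backend:
    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu // CUDA (nvcc) or HIP (hipcc) build
    #else
    namespace mg5amcCpu // plain C++ build, possibly SIMD-vectorized
    #endif
    {
      // ... identical source code is compiled into either namespace ...
    }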
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) */ @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc index d58066c9c1..eaf4037a24 100--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
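Two details of the Bridge hunks above are worth spelling out. First, the grid sizing: m_gputhreads defaults to 256 and m_gpublocks to m_nevt / m_gputhreads, and the while loop (whose body falls outside the hunk context) adjusts the pair until m_nevt == m_gpublocks * m_gputhreads; for example nevt=16384 already satisfies 64 * 256 and the loop never runs, whereas nevt=64 (assuming 64 is a legal multiple of s_gputhreadsmin, whose value is not shown here) must shrink gputhreads until 1 * 64 == 64. Second, the launch syntax: every CUDA triple-chevron launch moves behind the variadic gpuLaunchKernel macro defined in the new GpuAbstraction.h later in this diff, so the call site no longer mentions a backend. Both macro branches reproduce the original launch:

    // Call site after this diff (from Bridge::gpu_sequence above):
    gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads,
                     m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
    // Under __CUDACC__ and __HIPCC__ alike this expands to
    //   dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
    // which works because hipcc accepts the same <<<blocks, threads>>> launch syntax as nvcc.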
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ...
) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h rename to epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes!
*** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! } // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3.
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
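The computeMatrixElements hunk above also shows the error-handling convention that travels with these macros: a kernel launch returns no status, so every launch is followed by checkGpu( gpuPeekAtLastError() ), plus checkGpu( gpuDeviceSynchronize() ) when the host needs the results before proceeding. A self-contained CUDA illustration of the pattern (dummyKernel and main are hypothetical; assertGpu/checkGpu are taken from GpuRuntime.h in this diff, with the gpu* names expanded to their CUDA definitions):

    #include <cassert>
    #include <cstdio>
    #include <cuda_runtime.h>

    #define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
    inline void assertGpu( cudaError_t code, const char* file, int line, bool abort = true )
    {
      if( code != cudaSuccess )
      {
        printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
        if( abort ) assert( code == cudaSuccess );
      }
    }

    __global__ void dummyKernel( int* out ) { out[threadIdx.x] = threadIdx.x; } // hypothetical kernel

    int main()
    {
      int* d_out;
      checkGpu( cudaMalloc( &d_out, 32 * sizeof( int ) ) ); // gpuMalloc in the diff
      dummyKernel<<<1, 32>>>( d_out );     // a launch is asynchronous and returns no status...
      checkGpu( cudaPeekAtLastError() );   // ...so first check that the launch itself was valid,
      checkGpu( cudaDeviceSynchronize() ); // then wait and surface any execution-time error
      checkGpu( cudaFree( d_out ) );       // gpuFree in the diff
      return 0;
    }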
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h index 1afc589b11..b4b76f3842 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. 
Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index 48306a9d41..f29b8c5357 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_%(model_name)s.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs 
constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; 
#else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
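
The MemoryBuffers.h hunks above also swap the explicit checkCuda( cudaMallocHost/cudaMalloc/cudaFree/cudaMemcpy ) calls for gpuMallocHost, gpuMalloc, gpuFree(Host) and gpuMemcpy. Their definitions live in the new GpuRuntime.h and related GPU abstraction headers, which fall outside this section of the diff; the following is only a hypothetical sketch of how such wrappers typically map onto the two runtimes (the checkGpu error-check helper is an assumption, named in the spirit of the old checkCuda):

    // Hypothetical sketch; the real wrappers are in GpuRuntime.h (not shown here).
    #ifdef __CUDACC__
    #define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
    #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
    #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
    #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
    #define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
    #elif defined __HIPCC__
    #define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size, hipHostMallocDefault ) ) // hipMallocHost is deprecated
    #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
    #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
    #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
    #define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
    #endif

Either way the call sites stay identical, which is what lets the buffer classes above compile unchanged for CUDA and HIP.
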
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR!
CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 562af241af..b9840f1374 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for %(output_name)s by %(info_lines)s @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index b399eb36b0..dbca8e330f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. 
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
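
Note how the RNDGEN default earlier in this makefile falls back to hasNoCurand whenever GPUCC is hipcc: curand is a CUDA-only library, so HIP builds must use common (host-generated) random numbers instead. The mgOnGpuConfig.h hunk at the end of this patch enforces the same choice at compile time:

    // Quoted from the mgOnGpuConfig.h hunk below: curand is disabled
    // unconditionally when compiling with hipcc.
    #if defined __HIPCC__
    #define MGONGPU_HAS_NO_CURAND 1
    #endif
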
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif 
-ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index 25b6f8f7c8..2c084615d9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_PLATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%%.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 1175622ff4..46a8f0efc0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. 
-DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non
aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
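Aside: the net effect of the MGONGPUCPP_GPUIMPL macro introduced in mgOnGpuConfig.h above is that a single translation unit selects its namespace, and hence its implementation, from whichever compiler processes it. A minimal sketch of the dispatch pattern (the square helper is hypothetical; the empty __host__/__device__ fallbacks for C++ are the ones defined at the end of mgOnGpuConfig.h):

// The same source compiles under nvcc, hipcc, or a plain C++ compiler
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // CPU build (scalar or SIMD C++)
#endif
{
  __host__ __device__ inline double square( double x ) { return x * x; } // hypothetical example function
}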
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ 
namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 778e210468..815fd8d5b7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -14,7 +14,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 5f1ea36b9e..21a4c6aa74 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -4,13 +4,13 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
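Aside: the scalar-or-vector aliases touched in mgOnGpuVectors.h above carry the portability trick: on a GPU each thread processes one event, so fptype_sv collapses to a scalar, while the SIMD C++ build makes it a gcc/clang vector over neppV events. A condensed sketch, assuming fptype is double and neppV is 4 (the real header derives the width from the AVX mode):

#ifdef MGONGPUCPP_GPUIMPL // CUDA or HIP: one event per GPU thread, no SIMD
typedef double fptype_sv;
#elif defined MGONGPU_CPPSIMD // C++ SIMD build: one vector of neppV events
typedef double fptype_v __attribute__( ( vector_size( 4 * sizeof( double ) ) ) ); // neppV=4 here, for illustration
typedef fptype_v fptype_sv;
#else // C++ scalar build
typedef double fptype_sv;
#endif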
!========================================================================== //========================================================================== // Class member functions for calculating the matrix elements for %(process_lines)s -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -44,7 +44,7 @@ namespace mg5amcCpu %(cipdhrdcod)s %(cipchrdcod)s #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipddevice)s %(cipcdevice)s #else @@ -54,7 +54,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -80,8 +80,8 @@ namespace mg5amcCpu // Helicities for the process [NB do keep 'static' for this constexpr array, see issue #283] // *** NB There is no automatic check yet that these are in the same order as Fortran! #569 *** %(all_helicities)s -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -117,7 +117,7 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory %(cipdassign)s %(cipcassign)s -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipd2tipdSym)s %(cipc2tipcSym)s #else @@ -150,7 +150,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -215,12 +215,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -241,7 +241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -367,9 +367,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -393,7 +393,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt 
// input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -413,7 +413,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 893f7f3215..2c3adf57e2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -23,7 +23,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -32,7 +32,7 @@ namespace mg5amcCpu %(process_class_definitions)s //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -45,7 +45,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -75,7 +75,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 1e473edcf8..960f029d8d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== // *** COLOR CHOICE BELOW *** @@ -17,7 +17,7 @@ // (This method used to be called %(process_class_name)s::matrix_%(proc_name)s(%(matrix_args)s)?) %(color_matrix_lines)s -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -74,7 +74,7 @@ #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -133,7 +133,7 @@ MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 159e3d8d5d..66450ae367 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -4,11 +4,14 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
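Aside: the hunks above drop the explicit checkCuda( cudaMemcpyToSymbol( ... ) ) wrapper in favour of a bare gpuMemcpyToSymbol( ... ). One plausible way GpuAbstraction.h could provide that call, folding the status check into the macro (a sketch under that assumption, reusing the checkGpu name that appears in runTest.cc below; the real macro may differ):

#include <cassert>
#ifdef __CUDACC__
#define checkGpu( code ) assert( ( code ) == cudaSuccess ) // sketch only: a real implementation would report the error string
#define gpuMemcpyToSymbol( dst, src, bytes ) checkGpu( cudaMemcpyToSymbol( dst, src, bytes ) )
#elif defined __HIPCC__
#define checkGpu( code ) assert( ( code ) == hipSuccess )
#define gpuMemcpyToSymbol( dst, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( dst ), src, bytes ) )
#endif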
!========================================================================== + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -36,7 +39,7 @@ // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -246,7 +249,7 @@ // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. 
Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index 2d1578cb43..dbe151e990 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt %% neppM == 0 ); // nevt must be a multiple of neppM assert( nevt %% neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 83b61a9565..3e0ebe545f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. import os @@ -1110,7 +1110,7 @@ def get_process_function_definitions(self, write=True): %(len(coupling_indep), ' ), cxmake( m_pars->'.join(coupling_indep)) # AV only indep! replace_dict['cipcdevice'] = '__device__ __constant__ fptype cIPC[%i];'%(2*len(coupling_indep)) replace_dict['cipcstatic'] = 'static fptype cIPC[%i];'%(2*len(coupling_indep)) - replace_dict['cipc2tipcSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) );'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = 'gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipc2tipc'] = 'memcpy( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipcdump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl;'%len(coupling_indep) coup_str_hrd = '__device__ const fptype cIPC[%s] = { ' % (len(coupling_indep)*2) @@ -1121,7 +1121,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipcassign'] = '//const cxtype tIPC[0] = { ... }; // nicoup=0' replace_dict['cipcdevice'] = '__device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0' replace_dict['cipcstatic'] = 'static fptype* cIPC = nullptr; // unused as nicoup=0' - replace_dict['cipc2tipcSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) ); // nicoup=0'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = '//gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipc2tipc'] = '//memcpy( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipcdump'] = '' replace_dict['cipchrdcod'] = '__device__ const fptype* cIPC = nullptr; // unused as nicoup=0' @@ -1130,7 +1130,7 @@ def get_process_function_definitions(self, write=True): %(len(params), ', (fptype)m_pars->'.join(params)) replace_dict['cipddevice'] = '__device__ __constant__ fptype cIPD[%i];'%(len(params)) replace_dict['cipdstatic'] = 'static fptype cIPD[%i];'%(len(params)) - replace_dict['cipd2tipdSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) );'%len(params) + replace_dict['cipd2tipdSym'] = 'gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipd2tipd'] = 'memcpy( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipddump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl;'%len(params) param_str_hrd = '__device__ const fptype cIPD[%s] = { ' % len(params) @@ -1141,7 +1141,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipdassign'] = '//const fptype tIPD[0] = { ... 
}; // nparam=0' replace_dict['cipddevice'] = '//__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0' replace_dict['cipdstatic'] = '//static fptype* cIPD = nullptr; // unused as nparam=0' - replace_dict['cipd2tipdSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) ); // nparam=0'%len(params) + replace_dict['cipd2tipdSym'] = '//gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipd2tipd'] = '//memcpy( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipddump'] = '' replace_dict['cipdhrdcod'] = '//__device__ const fptype* cIPD = nullptr; // unused as nparam=0' @@ -1219,13 +1219,13 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1252,7 +1252,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( \"calculate_wavefunctions: ihel=%2d\\n\", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( \"calculate_wavefunctions: ievt00=%d\\n\", ievt00 ); #endif""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() @@ -1289,7 +1289,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif""") ret_lines += helas_calls @@ -1718,8 +1718,10 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -1835,7 +1837,7 @@ def get_external(self, wf, argument): split_line2 = [ str.lstrip(' ').rstrip(' ') for str in split_line2] # AV split_line2.insert(2, '0') # add parameter fmass=0 line2 = ', '.join(split_line2) - text = '#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV + text = '#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV return text % (line, line, line2) text = '%s\n' # AV return text % line diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 5267141530..c89295c01f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. import os import subprocess @@ -88,9 +88,9 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt'], + s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', - s+'gpu/ompnumthreads.h', s+'gpu/CudaRuntime.h', + s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', @@ -111,7 +111,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'CMake/SubProcesses/CMakeLists.txt'], 'test': [s+'gpu/cudacpp_test.mk']} to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', - 'ompnumthreads.h', 'CudaRuntime.h', + 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 36b42987c5..dd0f31341f 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005498409271240234  +DEBUG: model prefixing takes 0.005403280258178711  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.102 s +Wrote files for 8 helas calls in 0.098 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.203 s +ALOHA: aloha creates 3 routines in 0.200 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.260 s +ALOHA: aloha creates 7 routines in 0.537 s FFV1 FFV1 FFV2 @@ -248,9 +248,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.900s -user 0m1.697s -sys 0m0.195s +real 0m2.147s +user 0m1.627s +sys 0m0.231s Code generation completed in 2 seconds ************************************************************ * * @@ -277,7 +277,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -307,7 +307,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * 
m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
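Note for reviewers: the gpuMemcpy and gpuLaunchKernel call sites introduced above in Bridge::gpu_sequence rely on the macros defined in the new GpuAbstraction.h further down in this patch. A minimal sketch of the CUDA-side expansion (illustration only, not part of the patch; dst, src, bytes, blocks, threads are hypothetical names):

#ifdef __CUDACC__
// gpuMemcpy( dst, src, bytes, gpuMemcpyHostToDevice ) expands to the spelling it replaces:
checkGpu( cudaMemcpy( dst, src, bytes, cudaMemcpyHostToDevice ) );
// gpuLaunchKernel( dev_transposeMomentaF2C, blocks, threads, in, out, nevt ) expands to:
dev_transposeMomentaF2C<<<blocks, threads>>>( in, out, nevt );
#endif
// Under hipcc the same call sites expand to hipMemcpy and a HIP triple-chevron launch,
// so the per-call-site error checks no longer need to be written out by hand.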
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
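Note: the two-line namespace switch visible in each file above recurs throughout this patch; a minimal self-contained sketch of the pattern (illustration only, not part of the patch):

// The same translation unit is compiled twice: by the GPU compiler with
// MGONGPUCPP_GPUIMPL defined (symbols land in mg5amcGpu) and by the host C++
// compiler (symbols land in mg5amcCpu). Separate namespaces avoid ODR clashes
// for types that are defined differently in the two builds (see #318 and #725).
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  struct ExamplePayload { int value; }; // may have a different layout per build
}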
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
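Note: for context, a minimal usage sketch of the GpuAbstraction.h macros above (illustration only, not part of the patch; the kernel name scaleKernel and the block size are hypothetical). The checkGpu helper used by these macros comes from GpuRuntime.h, renamed from CudaRuntime.h below:

#include "GpuRuntime.h" // pulls in GpuAbstraction.h and defines checkGpu/assertGpu

__global__ void scaleKernel( double* data, double factor, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) data[i] *= factor;
}

void scaleOnDevice( double* hst, int n ) // assumes n is a positive multiple of 256
{
  double* dev = nullptr;
  gpuMalloc( &dev, n * sizeof( double ) ); // cudaMalloc or hipMalloc, error-checked
  gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scaleKernel, n / 256, 256, dev, 2.0, n ); // scaleKernel<<<n / 256, 256>>>( dev, 2.0, n )
  checkGpu( gpuPeekAtLastError() );
  gpuMemcpy( hst, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );
}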
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1.
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
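Note on the error-handling convention in the launches above (illustration only, not part of the patch; k is a hypothetical kernel): each triple-chevron launch is replaced one-for-one by a launch macro, and the two checks that follow play different roles:

gpuLaunchKernelSharedMem( k, blocks, threads, sharedMemSize, out ); // k<<<blocks, threads, sharedMemSize>>>( out )
checkGpu( gpuPeekAtLastError() );   // synchronous launch errors (e.g. invalid grid configuration)
checkGpu( gpuDeviceSynchronize() ); // asynchronous errors raised while the kernel executes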
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
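Note: as background for the neppM comment above, a sketch (illustration only, not the code in this file) of the AOSOA index arithmetic that makes neighbouring GPU threads load neighbouring addresses, for the momenta[npagM][npar][np4][neppM] layout with nevt = npagM * neppM described earlier in this patch:

// Flat AOSOA index for event ievt, particle ipar, momentum component ip4.
inline int aosoaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // memory page holding this event
  const int ieppM = ievt % neppM; // position of the event inside its page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}
// For fixed ipar and ip4, consecutive ievt values (i.e. consecutive GPU threads)
// differ only in ieppM, so their global-memory accesses are contiguous and coalesce.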
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
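Note: the pinned-host and device buffer classes above wrap every allocation and deallocation in a constructor/destructor pair; a reduced sketch of the same RAII pattern (illustration only, not part of the patch):

template<typename T>
class SimpleDeviceBuffer // RAII: the device allocation is released on scope exit
{
public:
  explicit SimpleDeviceBuffer( size_t n ) : m_size( n ), m_data( nullptr )
  {
    gpuMalloc( &m_data, bytes() ); // cudaMalloc or hipMalloc, error-checked
  }
  ~SimpleDeviceBuffer() { gpuFree( m_data ); } // freed even on early returns
  SimpleDeviceBuffer( const SimpleDeviceBuffer& ) = delete;
  SimpleDeviceBuffer& operator=( const SimpleDeviceBuffer& ) = delete;
  T* data() { return m_data; }
  size_t bytes() const { return m_size * sizeof( T ); }
private:
  size_t m_size;
  T* m_data;
};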
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 9193aa2382..83e5b15013 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef 
__CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -286,7 +287,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -343,7 +344,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -402,7 +403,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -449,8 +450,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -490,9 +491,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -529,7 +530,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -594,12 +595,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -620,7 +621,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -746,9 +747,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -772,7 +773,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -792,7 +793,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -806,9 +807,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -836,7 +840,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1046,7 +1050,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 77b610753c..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
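// (The complex type behind each of these tags is chosen per backend in mgOnGpuConfig.h, shown at the end of this patch: thrust::complex is the CUDA default, the home-grown cxsmpl is the only option on HIP, and cxsmpl is the new C++ default. A sketch of the resulting typedef, assuming the cxtype/fptype alias names used in the ME code, with the MGONGPU_CUCXTYPE_CUCOMPLEX alternative omitted:)
//   #if defined MGONGPU_CUCXTYPE_THRUST
//   typedef thrust::complex<fptype> cxtype; // CUDA default
//   #elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
//   typedef mgOnGpu::cxsmpl<fptype> cxtype; // only option in HIP, new default in C++
//   #elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
//   typedef std::complex<fptype> cxtype; // C++ alternative
//   #endif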
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision, see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
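# NB: the same .cc sources are compiled twice throughout this makefile: once with $(CXX) into %.o for the C++ library, and once with $(GPUCC) into %_cu.o (treated as GPU code, via -x cu in the case of nvcc) for the CUDA/HIP library. Both sets of objects can be linked into the same executable because every translation unit switches namespace on MGONGPUCPP_GPUIMPL; a sketch of the pattern used in the C++ sources above:
#   #include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL under nvcc/hipcc only
#   #ifdef MGONGPUCPP_GPUIMPL
#   namespace mg5amcGpu // these symbols end up in lib$(MG5AMC_CULIB).so
#   #else
#   namespace mg5amcCpu // these symbols end up in lib$(MG5AMC_CXXLIB).so
#   #endif
#   { /* identical code compiled for both backends */ }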
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ */ Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
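// (The DeviceReset helper below relies on the checkGpu/gpuDeviceReset wrappers that replace the old checkCuda/cudaDeviceReset pair. A sketch of the error-checking wrapper, assuming names analogous to the checkCuda/assertCuda helpers of the deleted CudaRuntime.h and gpuError_t/gpuSuccess/gpuGetErrorString aliases in GpuAbstraction.h:)
//   #define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
//   inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
//   {
//     if( code != gpuSuccess )
//     {
//       printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
//       if( abort ) assert( code == gpuSuccess );
//     }
//   }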
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 9fa30cfd7f..e878fcd28e 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 0b4be4d5ed..cffc5d3bff 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 64d0b8e761..2a6d960581 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the compiler-specific GPUFLAGS for nvcc (CUDA) or hipcc (HIP) builds +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
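The mgOnGpuConfig.h hunks below replace the CUDA-only __CUDACC__ guard with a single backend macro. A minimal sketch of the dispatch pattern they introduce (illustrative only; the real header carries many more options):

```cpp
// Minimal sketch of the backend dispatch introduced in mgOnGpuConfig.h below.
// __CUDACC__ is predefined by nvcc and __HIPCC__ by hipcc; a plain C++
// compiler defines neither, so MGONGPUCPP_GPUIMPL stays undefined and the
// CPU branch is taken everywhere the macro is tested.
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#else
#undef MGONGPUCPP_GPUIMPL
#endif

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // one namespace now covers both CUDA and HIP builds
#else
namespace mg5amcCpu // CPU build
#endif
{
  // backend() is a hypothetical helper, added here only for illustration
  inline const char* backend()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "gpu"; // CUDA or HIP
#else
    return "cpu";
#endif
  }
}
```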
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
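The mgOnGpuCxtypes.h hunks below apply the same macro to the complex-number typedefs. A hedged sketch of the resulting selection, assuming fptype is double (the real header chooses via the MGONGPU_CUCXTYPE_*/MGONGPU_CPPCXTYPE_* macros and the cxsmpl template it defines):

```cpp
// Sketch of the per-backend complex type selection that the sanity checks
// above enforce: thrust::complex by default on CUDA, cxsmpl as the only
// option on HIP, cxsmpl (or std::complex) on C++. Assumes fptype=double.
#ifdef __CUDACC__
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // MGONGPU_CUCXTYPE_THRUST (CUDA default)
#elif defined __HIPCC__
typedef mgOnGpu::cxsmpl<double> cxtype; // MGONGPU_CUCXTYPE_CXSMPL (only HIP option)
#else
typedef mgOnGpu::cxsmpl<double> cxtype; // MGONGPU_CPPCXTYPE_CXSMPL (C++ default)
#endif
```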
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return 
mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.mad/src/rambo.h b/epochX/cudacpp/ee_mumu.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 636fab0372..20d35a4a26 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00569605827331543  +DEBUG: model prefixing takes 0.005757331848144531  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +181,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.271 s +ALOHA: aloha creates 4 routines in 0.267 s FFV1 FFV1 FFV2 @@ -201,6 +201,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu quit real 0m0.662s -user 0m0.604s -sys 0m0.052s +user 0m0.596s +sys 0m0.051s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
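The transposition methods forward-declared above (dev_transposeMomentaF2C, hst_transposeMomentaF2C) convert the Fortran event-major momenta into the C++/GPU AOSOA layout AOSOA[npagM][npar=4][np4=4][neppM], with nevt = npagM * neppM. A hedged host-side sketch of the index arithmetic, with a simplified, hypothetical signature:

```cpp
// Fortran-to-C++ momenta transposition, sketched on the host.
// Input:  AOS   in[ievt][ipar][ip4]          (event-major, from Fortran)
// Output: AOSOA out[ipagM][ipar][ip4][ieppM] (ievt = ipagM * neppM + ieppM)
template<typename Tin, typename Tout>
void transposeMomentaF2C_sketch( const Tin* in, Tout* out, unsigned int nevt,
                                 int npar, int np4, int neppM )
{
  for( unsigned int ievt = 0; ievt < nevt; ievt++ )
  {
    const unsigned int ipagM = ievt / neppM; // AOSOA "page" holding this event
    const unsigned int ieppM = ievt % neppM; // position of the event in its page
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
        out[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM] =
          in[( ievt * npar + ipar ) * np4 + ip4];
  }
}
```

On the GPU the same arithmetic runs one element per thread rather than one event per thread, which is why the gpu_sequence hunk below launches dev_transposeMomentaF2C with m_gpublocks * thrPerEvt blocks.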
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
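Taken together, the macros in the new GpuAbstraction.h above let one translation unit target either runtime. A hedged usage sketch (doubleAll and the grid sizes are hypothetical; checkGpu is supplied by the GpuRuntime.h shown next):

```cpp
// Illustrative only: this source compiles unchanged with nvcc or hipcc.
#include "GpuRuntime.h" // pulls in GpuAbstraction.h and defines checkGpu

__global__ void doubleAll( double* data ) // hypothetical kernel
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  data[i] *= 2.;
}

void runSketch()
{
  const int gpublocks = 2, gputhreads = 32;
  const size_t bytes = gpublocks * gputhreads * sizeof( double );
  double* hst = nullptr;
  double* dev = nullptr;
  gpuMallocHost( &hst, bytes ); // cudaMallocHost or hipHostMalloc (pinned)
  gpuMalloc( &dev, bytes );     // cudaMalloc or hipMalloc
  gpuMemcpy( dev, hst, bytes, gpuMemcpyHostToDevice );
  gpuLaunchKernel( doubleAll, gpublocks, gputhreads, dev ); // <<<blocks, threads>>>
  gpuMemcpy( hst, dev, bytes, gpuMemcpyDeviceToHost );
  gpuFree( dev );     // cudaFree or hipFree
  gpuFreeHost( hst ); // cudaFreeHost or hipHostFree
}
```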
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
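The MatrixElementKernels.cc hunks above all follow the same launch-and-check discipline. A condensed sketch with a hypothetical kernel (the real sigmaKin takes many more buffers):

```cpp
// Launch through the abstraction macro, then check the launch itself and
// synchronize so that errors raised while the kernel runs also surface.
#include "GpuRuntime.h"

__global__ void sigmaKinSketch( const double* momenta, double* matrixElements ) // hypothetical
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  matrixElements[ievt] = momenta[ievt]; // placeholder per-event work
}

void computeSketch( const double* devMomenta, double* devMEs, int gpublocks, int gputhreads )
{
  // on CUDA this expands to sigmaKinSketch<<<gpublocks, gputhreads>>>( devMomenta, devMEs )
  gpuLaunchKernel( sigmaKinSketch, gpublocks, gputhreads, devMomenta, devMEs );
  checkGpu( gpuPeekAtLastError() );   // catches launch-configuration errors
  checkGpu( gpuDeviceSynchronize() ); // catches asynchronous kernel failures
}
```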
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
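The KernelAccessHelper hunk above keeps the one-event-per-thread convention on GPU: each thread derives its event index from the grid coordinates. A minimal sketch:

```cpp
// One event per GPU thread: the grid is sized so that
// nevt == gridDim.x * blockDim.x, and each thread owns one event.
__global__ void perEventSketch( const double* buffer, double* out ) // hypothetical kernel
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  out[ievt] = buffer[ievt]; // stands in for T::ieventAccessRecord( buffer, ievt )
}
```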
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
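The neppM hunk above (MemoryAccessMomenta.h) is worth unpacking, since the guard change preserves a performance-critical choice: on GPUs the momenta buffer is an AOSOA whose page size neppM is a power of 2 times the number of fptype's in a 32-byte cacheline, so that adjacent threads read adjacent values. A hypothetical index helper (names assumed, not from the header) makes the layout explicit:
// Momenta are laid out as array[npagM][np4][npar][neppM]: the neppM events of one
// page sit next to each other in memory for each (ip4, ipar) component.
inline int momentaIndex( int ievt, int ip4, int ipar, int npar, int neppM )
{
  constexpr int np4 = 4;          // E, px, py, pz
  const int ipagM = ievt / neppM; // AOSOA page containing this event
  const int ieppM = ievt % neppM; // position of this event within the page
  return ipagM * np4 * npar * neppM + ip4 * npar * neppM + ipar * neppM + ieppM;
}
With this layout, threads ievt and ievt+1 of one page touch consecutive fptype's, which is exactly the coalescing condition the comment in the hunk describes.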
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
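Throughout MemoryBuffers.h above, direct checkCuda( cuda* ) calls give way to gpu*-prefixed wrappers. GpuAbstraction.h itself is not reproduced in this diff, so the following mapping is an assumed sketch of what it plausibly contains (the checkGpu error-check macro is hypothetical; hipHostMalloc/hipHostFree are the genuine HIP counterparts of cudaMallocHost/cudaFreeHost):
#ifdef __CUDACC__ // CUDA backend
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#elif defined __HIPCC__ // HIP backend
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: not the deprecated hipMallocHost
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( sym, src, bytes ) )
#endif
One visible behavioural change in the hunks above is that the error check moves inside the wrapper: call sites like gpuMallocHost( &( this->m_data ), this->bytes() ) no longer wrap themselves in checkCuda.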
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 87bcecccd9..13429436af 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: 
ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -284,7 +285,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -341,7 +342,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -400,7 +401,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -447,8 +448,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -488,9 +489,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -527,7 +528,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -592,12 +593,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -618,7 +619,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -744,9 +745,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -770,7 +771,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -790,7 +791,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -804,9 +805,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -834,7 +838,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1044,7 +1048,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 77b610753c..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
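A note on the random-number defaults just shown: curand is a CUDA-only library, so a HIP build must never reach the CurandDevice default, and the new #error branch is a safety net for that. The guard is expected to come from the build system rather than the compiler (presumably via the RNDGEN=hasNoCurand logic in the cudacpp.mk hunks further down); a reduced sketch of the pattern follows, and the Bridge flag introduced by the comment above is declared immediately below it.
// Reduced sketch of the safety net used above: if a HIP compilation arrives here
// without the no-curand guard, the build system is misconfigured and we stop.
#if defined __HIPCC__ and !defined MGONGPU_HAS_NO_CURAND
#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__
#endif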
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
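From this point the printout code tells three toolchains apart instead of two. Stripped of the string building, the dispatch is just the following (illustrative fragment, not part of check_sa.cc); the complex-type tagging announced by the comment above continues right after this sketch:
#ifdef __CUDACC__
constexpr const char* backend = "CUDA"; // nvcc device compilation
#elif defined __HIPCC__
constexpr const char* backend = "HIP"; // hipcc device compilation
#else
constexpr const char* backend = "C++"; // plain host build (possibly SIMD-vectorized)
#endif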
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
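The cudacpp.mk section beginning here has to feed one optimization-flag list to two very different compilers: nvcc drives a separate host compiler, so host flags must be wrapped in -Xcompiler, while hipcc is clang-based and takes them directly. A condensed sketch of that difference (the foreach wrapping appears verbatim in the CUDA branch below; the two-variable presentation is illustrative only):
OPTFLAGS = -O3
# CUDA: forward each host flag through nvcc to the underlying host compiler
GPUFLAGS_cuda = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt))
# HIP: hipcc accepts host compiler flags as-is
GPUFLAGS_hip = $(OPTFLAGS)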
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
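
[Taken together, the cudacpp.mk hunks above boil down to a single three-way backend selection. A condensed sketch of the new logic, simplified and assuming GNU make — the real makefile also sets the NVTX, curand and ccache options and honours the REQUIRE_CUDA/REQUIRE_HIP guards in each branch:

    # Option 1: CUDA exists -> use nvcc as GPUCC
    ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
      GPUCC = $(CUDA_HOME)/bin/nvcc
    # Option 2: CUDA does not exist, HIP exists -> use hipcc as GPUCC
    else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
      GPUCC = $(HIP_HOME)/bin/hipcc
    # Option 3: neither exists -> switch GPU compilation off (CPU-only build)
    else
      override GPUCC =
    endif
    export GPUCC # also consumed by cudacpp_src.mk
]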
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif
 
 # Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif
 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 
 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
 * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
 * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 9fa30cfd7f..e878fcd28e 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 0b4be4d5ed..cffc5d3bff 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 64d0b8e761..2a6d960581 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
 
 #-------------------------------------------------------------------------------
 
@@ -45,13 +45,13 @@ endif
 
 #-------------------------------------------------------------------------------
 
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)
 
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
 
 #-------------------------------------------------------------------------------
 
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
 
 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
 
+# Add the build-rule flags appropriate to the GPU compiler: nvcc needs "-x cu" to compile .cc sources as CUDA (with -fPIC forwarded via -Xcompiler), while hipcc only needs -fPIC -c
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+ GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+ GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
 
 #-------------------------------------------------------------------------------
 
 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif
 
 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
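
[For the cudacpp_src.mk hunk above: moving the compiler-specific pieces into GPUFLAGS lets one generic pattern rule serve both backends. Roughly, under the flag assignments made earlier in that file, the same rule expands as follows — a sketch, not part of the patch:

    # nvcc:  GPUFLAGS ends in '-Xcompiler -fPIC -c -x cu'
    #        ('-x cu' tells nvcc to treat the .cc source as CUDA)
    # hipcc: GPUFLAGS ends in '-fPIC -c' (hipcc compiles .cc directly)
    $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
    	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
]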
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
 #endif
 
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
 
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
 
-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
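
[The mgOnGpuConfig.h hunk above is the keystone of this patch: MGONGPUCPP_GPUIMPL is defined whenever either GPU compiler is active, and every '#ifdef __CUDACC__' in the process files is mechanically rewritten to test it instead. The recurring pattern, illustrated on a hypothetical source file:

    #include "mgOnGpuConfig.h"
    #ifdef MGONGPUCPP_GPUIMPL // defined under nvcc (__CUDACC__) or hipcc (__HIPCC__)
    namespace mg5amcGpu
    #else // plain C++ compilers take this branch
    namespace mg5amcCpu
    #endif
    {
      // identical source text, two distinct namespaces (see #318 and #725)
    }
]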
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.sa/src/rambo.h b/epochX/cudacpp/ee_mumu.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a477013568..75c84e12fb 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005816459655761719  +DEBUG: model prefixing takes 0.005261659622192383  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.155 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.131 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.729s -user 0m1.515s -sys 0m0.204s +real 0m1.690s +user 0m1.458s +sys 0m0.220s Code generation completed in 2 seconds ************************************************************ * * @@ -266,7 +266,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -296,7 +296,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 
+249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
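
[Bridge.h above also shows the second layer of the abstraction: direct CUDA runtime calls become gpu*-prefixed wrappers, and <<<...>>> kernel launches become gpuLaunchKernel calls, so one call site serves both runtimes. The wrapper definitions live in the new GpuAbstraction.h/GpuRuntime.h headers, whose content is not shown in this diff; conceptually they amount to something like the following assumption-marked sketch, not the actual header:

    // hypothetical sketch of GpuAbstraction.h
    #ifdef __CUDACC__
    #define gpuMemcpy( d, s, n, k ) checkGpu( cudaMemcpy( d, s, n, k ) )
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuLaunchKernel( f, blocks, threads, ... ) f<<<blocks, threads>>>( __VA_ARGS__ )
    #elif defined __HIPCC__
    #define gpuMemcpy( d, s, n, k ) checkGpu( hipMemcpy( d, s, n, k ) )
    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
    #define gpuLaunchKernel( f, blocks, threads, ... ) hipLaunchKernelGGL( f, blocks, threads, 0, 0, __VA_ARGS__ )
    #endif
]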
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
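GpuAbstraction.h above is the core of the change: one set of gpu* spellings resolves to cuda* under nvcc and to hip* under hipcc, with the alloc/copy/free wrappers already folded through the checkGpu assertion macro. As an orientation aid only (not code from this PR), a caller written purely against these macros looks like the following sketch; the scale kernel, the grid sizes and the function names are invented, and a GPU build is assumed so that MGONGPUCPP_GPUIMPL is defined and checkGpu is available from GpuRuntime.h, whose diff continues just below:

// Hypothetical usage sketch of the gpu* abstraction layer (illustration only).
#include "GpuAbstraction.h"
#include "GpuRuntime.h" // assumed available: defines checkGpu()/assertGpu()

__global__ void scale( double* data, const double factor, const int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) data[i] *= factor; // guard: the grid may be larger than n
}

void scaleOnDevice( double* hstData, const int n )
{
  double* devData = nullptr;
  gpuMalloc( (void**)&devData, n * sizeof( double ) ); // cudaMalloc or hipMalloc, wrapped in checkGpu
  gpuMemcpy( devData, hstData, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scale, ( n + 255 ) / 256, 256, devData, 2., n ); // expands to scale<<<blocks, threads>>>( ... )
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hstData, devData, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( devData );
}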
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1.
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
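Both launch macros used in the hunks above are purely syntactic (see GpuAbstraction.h earlier in this diff), so these rewrites are behaviour-preserving; spelled out for two of the launches:

// Expansion of the launch macros (a restatement of the definitions above, not new code):
// gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() )
//   becomes, on CUDA and HIP alike:
//     computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
// gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, /* kernel args */ ... )
//   becomes:
//     sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( /* kernel args */ ... );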
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
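Every MemoryAccess* header in this stretch receives the same mechanical substitution, which is easy to lose in the flattened hunks. Assembled in one place, the recurring guard is simply:

// The recurring namespace guard (a restatement of the pattern above, not new code):
// one translation unit lands in mg5amcGpu on GPU builds and in mg5amcCpu on CPU builds,
// and the brace after the #endif opens whichever namespace was selected.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // ... identical definitions compiled for either backend ...
}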
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
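The net effect of these MemoryBuffers.h hunks: each buffer family keeps a plain host typedef for CPU builds and a pinned-host/device pair for GPU builds, now allocated through gpuMallocHost/gpuMalloc instead of direct cuda* calls. A sketch of how a caller selects among them, following the pattern check_sa.cc uses further below (nevt is whatever event count the caller chose; typedef names follow the families defined in this file):

// Buffer selection per backend (sketch of the usage pattern only).
#ifndef MGONGPUCPP_GPUIMPL
  HostBufferMomenta hstMomenta( nevt );         // plain C++ host allocation
#else
  PinnedHostBufferMomenta hstMomenta( nevt );   // gpuMallocHost: pinned memory, faster transfers
  DeviceBufferMomenta devMomenta( nevt );       // gpuMalloc: device global memory
  copyDeviceFromHost( devMomenta, hstMomenta ); // gpuMemcpy( ..., gpuMemcpyHostToDevice ), defined just below
#endif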
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..f20c229897 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; 
++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
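The bracketed comment above deserves unpacking, since three related switches are now in play. A hedged summary of standard toolchain behaviour (only MGONGPUCPP_GPUIMPL is specific to this codebase; how the build defines it is not shown in this diff):

#if defined __CUDACC__
// this file is being compiled as CUDA source (e.g. 'nvcc -x cu'): device code is generated
#elif defined __HIPCC__
// this file is being compiled as HIP source by hipcc/hip-clang
#endif
#ifdef __NVCC__
// nvcc is the compiler driver, even for a plain .cc host file (the case the comment above tests)
#endif
#ifdef MGONGPUCPP_GPUIMPL
// this compilation pass targets a GPU backend, whichever vendor toolchain is in use
#endif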
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +822,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
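Because backend selection is driven entirely by CUDA_HOME and HIP_HOME, the behaviour described in the two FIXME comments above can be steered from the command line. Illustrative invocations (the paths are placeholders, not actual defaults):

# force the HIP branch on a machine that also has CUDA, by making CUDA_HOME invalid
CUDA_HOME=/no/such/path HIP_HOME=/opt/rocm make -f cudacpp.mk
# in CI, fail instead of silently falling back to a C++-only build (#443),
# via the REQUIRE_CUDA guard below (a REQUIRE_HIP guard follows further down)
REQUIRE_CUDA=1 make -f cudacpp.mk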
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
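For reference, combining the generic GPU build rules earlier in this makefile with the CUBUILDRULEFLAGS and CCBUILDRULEFLAGS values from the two backend options gives the following effective compile lines (a sketch; Foo.cc stands for any .cc source compiled as GPU code):

# CUDA: -fPIC must be forwarded through -Xcompiler, and -x cu tells nvcc to treat .cc as CUDA
$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c -x cu Foo.cc -o Foo_cu.o
# HIP: hipcc is clang-based, so -fPIC is accepted directly and no -x cu is needed
$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -fPIC -c Foo.cc -o Foo_cu.o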
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 932f123fea..5f2f4391b9 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct build-rule flags when compiling with nvcc (CUDA) or hipcc (HIP) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
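The mgOnGpuConfig.h hunk just below defines the MGONGPUCPP_GPUIMPL guard that replaces __CUDACC__ throughout this diff. Reduced to a self-contained sketch (Foo is a placeholder type, not a name from this PR), the recurring idiom it enables is:

#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // CUDA and HIP builds both land in the GPU namespace
#else
namespace mg5amcCpu // C++ builds land in the CPU namespace
#endif
{
  struct Foo {}; // the same source yields mg5amcGpu::Foo or mg5amcCpu::Foo, so CPU and GPU symbols never collide
}

Since the guard is defined for both nvcc and hipcc, the GPU code paths in the files above compile for AMD GPUs without any HIP-specific conditionals at the call sites.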
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
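Note: the mgOnGpuConfig.h hunks above are the pivot of this patch: nvcc defines __CUDACC__, hipcc defines __HIPCC__, and either one now maps onto the single MGONGPUCPP_GPUIMPL macro, so the files below test one macro instead of a compiler-specific pair. A minimal sketch of the consumption pattern (an illustrative translation unit, not part of the patch):

    #include "mgOnGpuConfig.h"
    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu // GPU build (CUDA or HIP)
    #else
    namespace mg5amcCpu // CPU build (scalar or SIMD)
    #endif
    {
      void exampleDispatch(); // hypothetical function, shown only to illustrate the namespace switch
    }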
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 0db09949ad..5542e5323b 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005459308624267578  +DEBUG: model prefixing takes 0.005713224411010742  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.145 s VVV1 FFV1 FFV1 @@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.545s -user 0m0.487s -sys 0m0.049s +real 0m0.623s +user 0m0.466s +sys 0m0.061s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
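Note: a condensed sketch of the grid-sizing invariant enforced by the Bridge constructor above (names echo the hunk; the halving step and the value 16 for s_gputhreadsmin are assumptions, since the body of the while loop falls outside the hunk):

    #include <stdexcept>
    void sizeGpuGrid( int nevt, int& gpublocks, int& gputhreads )
    {
      const int gputhreadsmin = 16;  // assumed value of s_gputhreadsmin
      gputhreads = 256;              // default number of gpu threads
      gpublocks = nevt / gputhreads; // this ensures nevt <= gpublocks * gputhreads
      while( nevt != gpublocks * gputhreads )
      {
        gputhreads /= 2; // assumed: halve the block size until the grid tiles nevt exactly
        if( gputhreads < gputhreadsmin )
          throw std::runtime_error( "nevt is not an exact multiple of a usable block size" );
        gpublocks = nevt / gputhreads;
      }
    }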
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! 
This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
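Note: the CUDA-only CudaRuntime.h deleted above is superseded by the vendor-neutral GpuRuntime.h and GpuAbstraction.h headers introduced later in this diff. A minimal sketch of the idiom change, assuming a GPU build (the wrapper function and buffer names are hypothetical):

    #include "GpuRuntime.h" // brings in GpuAbstraction.h and the checkGpu/assertGpu helpers
    #include <cstddef>
    void copyMomentaToDevice( double* devBuf, const double* hstBuf, size_t bytes )
    {
      // before this patch (CUDA only): checkCuda( cudaMemcpy( devBuf, hstBuf, bytes, cudaMemcpyHostToDevice ) );
      // after: one macro that expands to cudaMemcpy or hipMemcpy and folds in the checkGpu assertion
      gpuMemcpy( devBuf, hstBuf, bytes, gpuMemcpyHostToDevice );
    }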
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
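Note: the gpuLaunchKernel and gpuLaunchKernelSharedMem macros in GpuAbstraction.h above keep the triple-chevron launch syntax out of client code; both nvcc and hipcc accept that syntax, so the two branches expand identically. A sketch of the expansion (the kernel and its arguments are hypothetical):

    __global__ void scaleKernel( double* buf, int n ); // assumed kernel, for illustration only
    // a call site written as
    //   gpuLaunchKernel( scaleKernel, gpublocks, gputhreads, devBuf, n );
    // expands, under either __CUDACC__ or __HIPCC__, to
    //   scaleKernel<<<gpublocks, gputhreads>>>( devBuf, n );
    // and the SharedMem variant adds the dynamic shared-memory size as a third launch parameter
    //   scaleKernel<<<gpublocks, gputhreads, sharedMemSize>>>( devBuf, n );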
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
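Note: the MatrixElementKernelDevice hunks above follow a common launch-and-check pattern: launch through the abstraction macro, surface launch-configuration errors with gpuPeekAtLastError, then (where the result is needed synchronously) surface asynchronous execution errors with gpuDeviceSynchronize. A condensed sketch, assuming a GPU build (the kernel and wrapper are hypothetical):

    #include "GpuRuntime.h"
    __global__ void doWork( double* buf );
    void runStep( double* devBuf, int gpublocks, int gputhreads )
    {
      gpuLaunchKernel( doWork, gpublocks, gputhreads, devBuf );
      checkGpu( gpuPeekAtLastError() );   // reports an invalid launch configuration immediately
      checkGpu( gpuDeviceSynchronize() ); // blocks until the kernel finishes and reports execution errors
    }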
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 
@@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
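The neppM comment above is the key to coalescing: momenta are stored as an AOSOA with neppM events per page, so consecutive GPU threads read consecutive fptype's from global memory. A sketch of the resulting index arithmetic follows; the helper name is hypothetical and the layout is simplified from the ieventAccessRecord logic in MemoryAccessMomenta.h.

#include <cstddef>
// Sketch only: AOSOA offset of component ip4 of particle ipar for event ievt,
// with npar particles, np4 = 4 momentum components and neppM events per page.
inline std::size_t aosoaIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                               std::size_t npar, std::size_t np4, std::size_t neppM )
{
  const std::size_t ipagM = ievt / neppM; // page containing this event
  const std::size_t ieppM = ievt % neppM; // event offset within the page
  return ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
}

For fixed ipar and ip4, threads ievt and ievt+1 differ only in ieppM, i.e. they touch adjacent fptype's, which is exactly the coalesced access pattern the comment asks for.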
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : 
public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
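The gpuMallocHost/gpuMalloc/gpuFree calls introduced above, replacing the checkCuda( cuda... ) pattern, are presumably thin wrappers that pick the CUDA or HIP runtime at compile time. A sketch of plausible mappings, assuming a checkGpu error-checking helper (the names and error handling in the real GpuAbstraction.h may differ):

// Sketch only: checkGpu is a hypothetical abort-on-error helper.
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( dst, src, n, kind ) checkGpu( cudaMemcpy( dst, src, n, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB hipHostMalloc, not the deprecated hipMallocHost
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuMemcpy( dst, src, n, kind ) checkGpu( hipMemcpy( dst, src, n, kind ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif

Note that the wrappers also swallow the explicit checkCuda call sites: error checking moves inside the macro, which is why the buffer constructors and destructors above become one-liners.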
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index d390883453..e7dbb05570 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity 
= 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -299,7 +300,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -356,7 +357,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -415,7 +416,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -462,8 +463,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -503,9 +504,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -541,7 +542,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -606,12 +607,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -632,7 +633,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -758,9 +759,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -784,7 +785,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -804,7 +805,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -818,9 +819,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -848,7 +852,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1058,7 +1062,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# (NB: the -x cu flag of the old NVCC rule above is now provided per compiler via CCBUILDRULEFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
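For context on the new header used by fbridge.cc above: GpuRuntime.h replaces CudaRuntime.h as the backend-neutral wrapper around runtime setup and teardown, and GpuRuntime::setUp/tearDown are the only entry points called from the Fortran bridge. A minimal sketch of what such a wrapper can look like follows; the setUp/tearDown names are taken from this diff, while the bodies and the gpuSetDevice alias are illustrative assumptions, not the actual implementation:

// GpuRuntime.h (illustrative sketch only, not the actual file)
#include "GpuAbstraction.h" // assumed to provide the checkGpu, gpuSetDevice and gpuDeviceReset aliases
struct GpuRuntime final
{
  // Set up the GPU runtime eagerly, e.g. by touching the default device (illustrative body)
  static void setUp() { checkGpu( gpuSetDevice( 0 ) ); }
  // Tear down the GPU runtime before exit (illustrative body; gpuDeviceReset also appears in runTest.cc below)
  static void tearDown() { checkGpu( gpuDeviceReset() ); }
};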
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 932f123fea..5f2f4391b9 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA/HIP compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct compiler-specific build-rule flags (-fPIC, -c and, for nvcc only, -x cu) to GPUFLAGS +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ +// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
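The empty __global__/__host__/__device__ definitions above are what allow the same source files to compile as CUDA, HIP or plain C++: under nvcc (__CUDACC__) or hipcc (__HIPCC__) the new MGONGPUCPP_GPUIMPL macro is defined and the real specifiers apply, while in plain C++ builds the specifiers expand to nothing. A toy translation unit illustrating the selection pattern used throughout this patch (the sumP4 helper is hypothetical, not part of the patch):

#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build: CUDA (via __CUDACC__) or HIP (via __HIPCC__)
#else
namespace mg5amcCpu // plain C++ build: __host__ and __device__ expand to nothing
#endif
{
  // Hypothetical helper: a device function on GPU, an ordinary inline function on CPU
  __host__ __device__ inline double sumP4( const double p[4] )
  {
    return p[0] + p[1] + p[2] + p[3];
  }
}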
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef 
__CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.sa/src/rambo.h b/epochX/cudacpp/gg_tt.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.sa/src/rambo.h +++ b/epochX/cudacpp/gg_tt.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index b3d319e039..f38b6ec6e6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005671977996826172  +DEBUG: model prefixing takes 0.005505561828613281  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.247 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.243 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.330 s +ALOHA: aloha creates 5 routines in 0.324 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -283,10 +283,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.334s -user 0m2.083s -sys 0m0.238s -Code generation completed in 2 seconds +real 0m2.484s +user 0m2.030s +sys 0m0.256s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -312,7 +312,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -342,7 +342,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
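The dev_transposeMomentaF2C and hst_transposeMomentaF2C methods forward-declared above convert momenta from the Fortran event-by-event layout into the AOSOA layout momenta[npagM][npar][np4][neppM] (with nevt = npagM*neppM) used by the cudacpp kernels. A scalar toy version showing the index arithmetic involved — illustrative only, assuming the Fortran buffer stores the npar*np4 components of each event contiguously; the real host and device kernels are templated over the memory access classes:

// Toy AOS-to-AOSOA momenta transpose (sketch): in[ievt][ipar][ip4] -> out[ipagM][ipar][ip4][ieppM]
template<typename Tin, typename Tout>
void toyTransposeMomentaF2C( const Tin* in, Tout* out, unsigned int nevt, int npar, int np4, int neppM )
{
  for( unsigned int ievt = 0; ievt < nevt; ievt++ )
  {
    const unsigned int ipagM = ievt / neppM; // AOSOA "page" containing this event
    const unsigned int ieppM = ievt % neppM; // position of this event within its page
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
        out[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM] =
          in[( ievt * npar + ipar ) * np4 + ip4];
  }
}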
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include <cassert>
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
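// Illustrative usage sketch of the GpuAbstraction.h macros above (not from the patch): the
// same source compiles with nvcc (__CUDACC__) or hipcc (__HIPCC__). checkGpu/assertGpu come
// from GpuRuntime.h below; the 'scale' kernel and the assumption that nevt equals
// gpublocks * gputhreads are illustrative only.
__global__ void scale( double* data, double factor )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
  data[i] *= factor;
}
void scaleOnDevice( double* hstData, int nevt, int gpublocks, int gputhreads )
{
  double* devData = nullptr;
  gpuMalloc( (void**)&devData, nevt * sizeof( double ) ); // checked cudaMalloc or hipMalloc
  gpuMemcpy( devData, hstData, nevt * sizeof( double ), gpuMemcpyHostToDevice ); // checked copy to device
  gpuLaunchKernel( scale, gpublocks, gputhreads, devData, 2.0 ); // expands to scale<<<gpublocks, gputhreads>>>( devData, 2.0 )
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hstData, devData, nevt * sizeof( double ), gpuMemcpyDeviceToHost ); // checked copy back
  gpuFree( devData );
}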
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include <iostream>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl;
+      checkGpu( gpuDeviceReset() );
+    }
+  };
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_GPURUNTIME_H
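// Illustrative sketch (not from the patch) of how the GpuRuntime RAII helper above is meant
// to be used, mirroring the check_sa.cc change later in this patch: constructing it at the
// top of main() pays the one-off device-initialization cost at a well-defined point and
// books the device reset for scope exit. The rest of the main() body is elided here.
int main( int argc, char** argv )
{
#ifdef MGONGPUCPP_GPUIMPL
  mg5amcGpu::GpuRuntime gpuRuntime( /*debug=*/true ); // gpuSetDevice(0) now, gpuDeviceReset() on destruction
#endif
  // ... set up buffers, launch kernels, copy results back ...
  return 0;
}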
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h
index ef40624c88..a64c05c26a 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -22,7 +22,7 @@
 #include
 #include
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam
 
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 
 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
   }
 }
 
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 
 #endif /* MADGRAPHTEST_H_ */
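// Illustrative sketch (not from the patch): the dual-namespace idiom that MGONGPUCPP_GPUIMPL
// drives throughout these files. Each translation unit is compiled once for the GPU and once
// for the CPU, and the macro selects the namespace, so both variants of every symbol can be
// linked into a single executable (as the MadgraphTest comment above explains).
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // identical source here becomes mg5amcGpu::* in GPU builds and mg5amcCpu::* in CPU builds
}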
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
index 74b5239ebf..81699dfea9 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "MatrixElementKernels.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
@@ -14,7 +14,7 @@
 
 //============================================================================
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
@@ -150,7 +150,7 @@
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -209,13 +209,13 @@
     PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
     DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
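// Illustrative note (not from the patch): why each launch above is followed by two checks.
// A bad grid configuration surfaces immediately in gpuPeekAtLastError, while an error raised
// inside the kernel body only surfaces once the device is synchronized. 'someKernel' and its
// arguments are hypothetical.
//   gpuLaunchKernel( someKernel, gpublocks, gputhreads, args );
//   checkGpu( gpuPeekAtLastError() );   // catches launch-configuration errors
//   checkGpu( gpuDeviceSynchronize() ); // catches asynchronous errors from the kernel body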
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
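// Illustrative sketch (not from the patch) of the AOSOA indexing behind MemoryAccessMomenta:
// momenta are laid out as momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM, so the
// neppM events of a page are contiguous for each particle/component and GPU reads coalesce.
// The helper and all names are hypothetical; the real accessors live in the header above.
inline double& aosoaMomentum( double* buffer, int ievt, int ipar, int ip4, int neppM, int npar, int np4 )
{
  const int ipagM = ievt / neppM; // page, i.e. block of neppM events
  const int ieppM = ievt % neppM; // event within the page
  return buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
}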
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
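// Illustrative sketch (not from the patch): the RAII core of the DeviceBufferBase and
// PinnedHostBufferBase classes above, reduced to a minimal hypothetical template. Allocation
// and release go through the gpu* macros, so the same code builds for CUDA and HIP, and the
// destructor guarantees the device memory is released.
#include <cstddef>
template<typename T>
struct SimpleDeviceBuffer
{
  SimpleDeviceBuffer( std::size_t size ) : m_size( size ), m_data( nullptr )
  {
    gpuMalloc( (void**)&m_data, m_size * sizeof( T ) ); // checked cudaMalloc or hipMalloc
  }
  ~SimpleDeviceBuffer() { gpuFree( m_data ); } // checked cudaFree or hipFree
  SimpleDeviceBuffer( const SimpleDeviceBuffer& ) = delete; // single owner, as in BufferBase
  SimpleDeviceBuffer& operator=( const SimpleDeviceBuffer& ) = delete;
  T* data() { return m_data; }
  std::size_t bytes() const { return m_size * sizeof( T ); }
  std::size_t m_size;
  T* m_data;
};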
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..f20c229897 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +822,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ 
b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index bfab81142d..3c7715b235 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 3901ddcb20..d4b3c0445c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision; see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le)
- CUFLAGS+= -Xcompiler -mno-float128
+ GPUFLAGS+= -Xcompiler -mno-float128
endif
#-------------------------------------------------------------------------------
@@ -237,7 +296,7 @@ endif
# Set the default OMPFLAGS choice
ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
override OMPFLAGS = -fopenmp
-###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578)
+###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578)
else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),)
override OMPFLAGS = -fopenmp
###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578)
@@ -293,7 +352,10 @@ endif
# Set the default RNDGEN (random number generator) choice
ifeq ($(RNDGEN),)
- ifeq ($(NVCC),)
+ ifeq ($(GPUCC),)
+ override RNDGEN = hasNoCurand
+ # Edge case for HIP compilation: hipcc builds have no curand either
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
override RNDGEN = hasNoCurand
else ifeq ($(RNDGEN),)
override RNDGEN = hasCurand
@@ -310,7 +372,7 @@ export OMPFLAGS
#-------------------------------------------------------------------------------
-#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
# Set the build flags appropriate to OMPFLAGS
$(info OMPFLAGS=$(OMPFLAGS))
@@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS)
$(info FPTYPE=$(FPTYPE))
ifeq ($(FPTYPE),d)
CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
- CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
+ GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE
else ifeq ($(FPTYPE),f)
CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
- CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
+ GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT
else ifeq ($(FPTYPE),m)
CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
- CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
+ GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT
else
$(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported)
endif
@@ -383,7 +445,7 @@ endif
$(info HELINL=$(HELINL))
ifeq ($(HELINL),1)
CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
- CUFLAGS += -DMGONGPU_INLINE_HELAMPS
+ GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
else ifneq ($(HELINL),0)
$(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
endif
@@ -392,7 +454,7 @@ endif
$(info HRDCOD=$(HRDCOD))
ifeq ($(HRDCOD),1)
CXXFLAGS += -DMGONGPU_HARDCODE_PARAM
- CUFLAGS += -DMGONGPU_HARDCODE_PARAM
+ GPUFLAGS += -DMGONGPU_HARDCODE_PARAM
else ifneq ($(HRDCOD),0)
$(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
endif
@@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin)
override CULIBFLAGSRPATH2 =
else
# RPATH to cuda/cpp libs when linking executables
- override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH)
- override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH)
+ override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH)
+ override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH)
# RPATH to common lib when linking cuda/cpp libs
- override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN'
- override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN'
+ override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN'
+ override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN'
endif
# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =
cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
@touch $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@
$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (NB: for nvcc builds, CCBUILDRULEFLAGS in the rule above includes '-x cu' to compile .cc files as CUDA)
# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@
# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Edge case added for HIP compilation: hipcc accepts -fno-fast-math directly, without the -Xcompiler prefix
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+ $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+ $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif
@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif
-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif
@@ -541,8 +607,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
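# ------------------------------------------------------------------------------
# NB: a condensed sketch of the GPUCC backend selection and build-rule flags that
# the cudacpp.mk hunks above introduce, assuming the CUDA branch assigns GPUCC
# from CUDA_HOME (that hunk is outside this excerpt and inferred from the HIP
# branch); the flag values are copied from the hunks above.
ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
  GPUCC = $(CUDA_HOME)/bin/nvcc # Option 1: CUDA exists (assignment inferred)
  CUBUILDRULEFLAGS = -Xcompiler -fPIC -c # compile .cu files as CUDA
  CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu # compile .cc files as CUDA
else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
  GPUCC = $(HIP_HOME)/bin/hipcc # Option 2: no CUDA, HIP exists
  CUBUILDRULEFLAGS = -fPIC -c # hipcc needs no -Xcompiler prefix and no -x cu
  CCBUILDRULEFLAGS = -fPIC -c
else
  override GPUCC = # Option 3: neither CUDA nor HIP, CPU-only build
endif
# The generic compile and link rules then become backend independent, e.g.:
# $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@
# ------------------------------------------------------------------------------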
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
- $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+ $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif
# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
ccache --version | head -1
endif
@echo ""
- @echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
- $(NVCC) --version
+ @echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+ $(GPUCC) --version
endif
@echo ""
@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif
# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
* Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
* The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
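// ------------------------------------------------------------------------------
// NB: the new GpuRuntime.h / GpuAbstraction.h headers included above are not part
// of this excerpt; the following is only a sketch of the macro mapping implied by
// their call sites (GpuRuntime::setUp/tearDown above, checkGpu( gpuDeviceReset() ),
// gpuMemcpy and gpuLaunchKernel in the hunks below), assuming CUDA-style error
// checking as in the old CudaRuntime.h.
#include <cstdio>
#include <cstdlib>
#ifdef __CUDACC__
#include <cuda_runtime.h>
typedef cudaError_t gpuError_t;
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuDeviceReset cudaDeviceReset
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
typedef hipError_t gpuError_t;
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuDeviceReset hipDeviceReset
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#endif
// Same pattern as the old checkCuda (see https://stackoverflow.com/a/14038590)
#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
{
  if( code != gpuSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
    if( abort ) std::exit( code );
  }
}
// Memcpy with the error check folded in (assumption: the bare gpuMemcpy call
// sites below replace the old checkCuda( cudaMemcpy( ... ) ) pattern)
#ifdef __CUDACC__
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif
// GpuRuntime::setUp()/tearDown(), called from fbridgecreate_/fbridgedelete_ above,
// would then wrap backend-independent initialisation and reset, e.g. tearDown()
// calling checkGpu( gpuDeviceReset() ) as runTest.cc does below.
// ------------------------------------------------------------------------------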
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif
#-------------------------------------------------------------------------------
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA/HIP compiler (note: GPUCC is already exported including ccache)
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
#-------------------------------------------------------------------------------
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)
+# Add the compiler-specific build flags: nvcc needs the -Xcompiler prefix for -fPIC and '-x cu' to compile .cc as CUDA, hipcc takes -fPIC directly
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+ GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+ GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
#-------------------------------------------------------------------------------
cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif
# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
- $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+ $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
#endif
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;
// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */
-// Define empty CUDA declaration specifiers for C++
+// Define empty CUDA/HIP declaration specifiers for C++
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
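// ------------------------------------------------------------------------------
// NB: a compact recap of the backend dispatch introduced by the mgOnGpuConfig.h
// hunks above (all taken from the patch itself): a single macro now selects the
// implementation namespace and the complex type for CUDA, HIP and C++.
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda // nvcc: complex type is thrust/cucomplex/cxsmpl
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip // hipcc: complex type is cxsmpl (only option)
#else
#undef MGONGPUCPP_GPUIMPL // host C++: complex type is std::complex or cxsmpl
#endif
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // CPU build (scalar or SIMD)
#endif
{
  // ... the same process code is compiled once per backend ...
}
// ------------------------------------------------------------------------------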
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 37ba5c7297..00ae96c5fb 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005791187286376953  +DEBUG: model prefixing takes 0.0055010318756103516  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,14 +191,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s -Wrote files for 36 helas calls in 0.153 s +Wrote files for 36 helas calls in 0.184 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.325 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -252,9 +252,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.208s -user 0m1.988s -sys 0m0.221s +real 0m2.571s +user 0m1.941s +sys 0m0.238s Code generation completed in 2 seconds ************************************************************ * * @@ -281,7 +281,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -311,7 +311,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) 
@@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
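Everything in GpuAbstraction.h is a thin macro layer, so downstream code can be written once against the gpu* names and compiled for either backend. A minimal usage sketch (a hypothetical standalone function, assuming a GPU build in which GpuRuntime.h defines the checkGpu wrapper used by these macros):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // assumption: provides the checkGpu wrapper used by the macros

__global__ void scale( double* d, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) d[i] *= 2.;
}

void runScale( const double* hst, double* out, int n )
{
  double* dev = nullptr;
  gpuMalloc( (void**)&dev, n * sizeof( double ) );                  // cudaMalloc or hipMalloc, error-checked
  gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scale, ( n + 255 ) / 256, 256, dev, n );         // expands to scale<<<blocks, threads>>>( dev, n )
  gpuMemcpy( out, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );                                                   // any failure prints "ERROR! assertGpu: ..." and asserts
}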
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl;
+      checkGpu( gpuDeviceReset() );
+    }
+  };
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1.
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
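The call-site changes above are behavior-preserving: gpuLaunchKernel and gpuLaunchKernelSharedMem expand back to the triple-chevron syntax on CUDA (and the HIP equivalent), while checkGpu( gpuPeekAtLastError() ) and checkGpu( gpuDeviceSynchronize() ) keep the launch-time and asynchronous error checks portable. A self-contained sketch of the shared-memory variant (hypothetical kernel, assuming the headers introduced above):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // assumption: provides checkGpu

__global__ void smemFill( float* out, int n )
{
  extern __shared__ float tile[];                        // sized by the sharedMem launch argument
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  tile[threadIdx.x] = 1.f;
  __syncthreads();
  if( i < n ) out[i] = tile[threadIdx.x];
}

void launchWithSharedMem( float* devOut, int n )
{
  const int threads = 256, blocks = ( n + threads - 1 ) / threads;
  const unsigned int shm = threads * sizeof( float );    // dynamic shared memory per block
  gpuLaunchKernelSharedMem( smemFill, blocks, threads, shm, devOut, n ); // smemFill<<<blocks, threads, shm>>>( devOut, n )
  checkGpu( gpuPeekAtLastError() );                      // surface launch configuration errors
  checkGpu( gpuDeviceSynchronize() );                    // surface asynchronous execution errors
}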
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
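The neppM guidance in the MemoryAccessMomenta hunk above is a cacheline argument, which a little compile-time arithmetic makes concrete (illustrative values only, not repository code):

// A 32-byte cacheline holds 4 doubles or 8 floats, so a neppM chosen as a power of 2
// times these counts keeps each AOSOA page cacheline-aligned: 32 consecutive GPU
// threads (one warp; consecutive ievt means consecutive ieppM) then read contiguous memory.
constexpr int fptypesPerCacheline_double = 32 / sizeof( double ); // 4
constexpr int fptypesPerCacheline_float = 32 / sizeof( float );   // 8
static_assert( fptypesPerCacheline_double == 4, "cacheline arithmetic" );
static_assert( fptypesPerCacheline_float == 8, "cacheline arithmetic" );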
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
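In the buffer hunks above, the raw checkCuda( cudaMallocHost/cudaMalloc/cudaFree* ) calls inside the RAII buffer classes become one-line gpu* macro calls, and nothing else changes. A stripped-down sketch of the pattern (a hypothetical class in the style of DeviceBufferBase, whose template parameters are elided in this diff):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // assumption: provides checkGpu

template<typename T>
class DeviceArray // hypothetical RAII device buffer
{
public:
  explicit DeviceArray( size_t n ) : m_size( n ), m_data( nullptr )
  {
    gpuMalloc( (void**)&m_data, bytes() );      // cudaMalloc or hipMalloc, error-checked
  }
  ~DeviceArray() { gpuFree( m_data ); }         // released automatically on scope exit
  DeviceArray( const DeviceArray& ) = delete;   // non-copyable, like the repository buffers
  DeviceArray& operator=( const DeviceArray& ) = delete;
  T* data() { return m_data; }
  size_t bytes() const { return m_size * sizeof( T ); }
private:
  size_t m_size;
  T* m_data;
};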
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index afeebde3c6..0e4d5d1157 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
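gpuMemcpyToSymbol provides the same portability for the __constant__-memory copies of the helicity table (cHel) and physics parameters (cIPD) above. A minimal sketch (hypothetical symbol names, assuming a GPU build where checkGpu is defined):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // assumption: provides checkGpu

__device__ __constant__ double cMasses[2]; // hypothetical device-side constant buffer

void setMasses( double mt, double wt )
{
  const double tmp[2] = { mt, wt };
  gpuMemcpyToSymbol( cMasses, tmp, 2 * sizeof( double ) ); // cudaMemcpyToSymbol or hipMemcpyToSymbol, error-checked
}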
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ 
b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
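The hunk above hard-wires the per-backend defaults for random number generation. A minimal sketch of the resulting decision ladder, assuming only the macros visible in this patch (MGONGPU_HAS_NO_CURAND, __CUDACC__, __HIPCC__); on HIP the first branch is always taken, because mgOnGpuConfig.h force-defines MGONGPU_HAS_NO_CURAND, which is why the explicit __HIPCC__ branch above is only an #error guard:

// Sketch only, not a literal copy of check_sa.cc.
enum class RandomNumberMode{ CommonRandom = 0, CurandHost = 1, CurandDevice = 2 };
inline RandomNumberMode defaultRndgen()
{
#ifdef MGONGPU_HAS_NO_CURAND
  return RandomNumberMode::CommonRandom; // no curand in this build (all HIP builds, some CUDA/HPC builds, PR #784 and #785)
#elif defined __CUDACC__
  return RandomNumberMode::CurandDevice; // default on NVidia GPU if the build has curand
#else
  return RandomNumberMode::CurandHost; // default on CPU if the build has curand
#endif
}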
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
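The workflow-tag logic above relies on a three-way compiler dispatch that recurs throughout this patch: __CUDACC__ is predefined by nvcc, __HIPCC__ by hipcc, and plain host compilers define neither. A condensed sketch (illustrative only, mirroring the wrkflwtxt branches above):

inline const char* backendTag()
{
#ifdef __CUDACC__
  return "CUD:"; // compiled by nvcc
#elif defined __HIPCC__
  return "HIP:"; // compiled by hipcc
#else
  return "CPP:"; // compiled by a plain host C++ compiler
#endif
}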
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
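Almost every source file touched above repeats the same namespace dispatch, so it is worth spelling out once as a condensed sketch: the implementation is compiled twice from one source text, once into mg5amcGpu and once into mg5amcCpu, which is what lets the build link a GPU object (*_cu.o, compiled by GPUCC from the .cc file) and a CPU object (*.o, compiled by CXX from the same file) into one executable without symbol clashes (see also #318 and #725):

#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL under nvcc/hipcc, undefines it otherwise
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU implementation (CUDA or HIP)
#else
namespace mg5amcCpu // C++/SIMD host implementation
#endif
{
  // ... identical source text, compiled into two distinct sets of symbols ...
}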
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
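The special-casing of CrossSectionKernels above (forcing -fno-fast-math under both GPUCC variants, with nvcc needing the -Xcompiler wrapper and hipcc taking the flag directly) exists because the NaN/abnormal-ME checks need IEEE semantics; under fast math the compiler may assume NaNs never occur and optimise the test away (see #117 and #516). A minimal illustration of the failure mode (a sketch, not project code):

#include <cmath>
// With -ffast-math (or nvcc -use_fast_math) the compiler may fold this to 'return false',
// silently hiding abnormal matrix elements instead of reporting them:
inline bool isAbnormal( const double me )
{
  return std::isnan( me ) || std::isinf( me );
}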
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
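fbridgecreate_ and fbridgedelete_ above call GpuRuntime::setUp() and GpuRuntime::tearDown(), while check_sa.cc instantiates a GpuRuntime object for the lifetime of main. GpuRuntime.h itself is not shown in this diff, so the class shape below is only inferred from those call sites; gpuSetDevice and gpuDeviceReset stand for the cuda*/hip* calls behind the (assumed) GpuAbstraction.h aliases:

// Inferred sketch of a GpuRuntime RAII helper (not the literal header).
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); } // also reachable via the explicit tearDown() in fbridgedelete_
  static void setUp( const bool /*debug*/ = true ) { checkGpu( gpuSetDevice( 0 ) ); }     // cudaSetDevice(0) on CUDA
  static void tearDown( const bool /*debug*/ = true ) { checkGpu( gpuDeviceReset() ); }   // needed by cuda-memcheck --leak-check full
  const bool m_debug;
};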
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct compiler-specific flags to GPUFLAGS when compiling for CUDA (nvcc) or HIP (hipcc) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
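[Editor's sketch, not part of the patch] The mgOnGpuConfig.h hunk below introduces the single MGONGPUCPP_GPUIMPL switch that the rest of this patch substitutes for raw __CUDACC__ tests. A minimal sketch of the same detection idiom, assuming only the standard nvcc/hipcc predefined macros (DEMO_GPUIMPL is a hypothetical stand-in for MGONGPUCPP_GPUIMPL):

#include <cstdio>
// Define the switch when either GPU compiler is active (as mgOnGpuConfig.h does);
// plain C++ builds leave it undefined, so a single macro test selects the code path.
#if defined __CUDACC__ || defined __HIPCC__
#define DEMO_GPUIMPL 1
#endif
int main()
{
#ifdef DEMO_GPUIMPL
  std::printf( "GPU build (nvcc or hipcc)\n" );
#else
  std::printf( "CPU build (plain C++)\n" );
#endif
  return 0;
}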
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
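[Editor's sketch, not part of the patch] The config macros above choose exactly one complex-number backend per build (thrust, cuComplex or cxsmpl in CUDA; cxsmpl only in HIP; std::complex or cxsmpl in C++), and the mgOnGpuCxtypes.h hunk below maps that choice onto a single cxtype alias. A hedged sketch of the same dispatch, reduced to two backends (demo_cxtype is a hypothetical name):

#if defined __CUDACC__ && defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<double> demo_cxtype; // CUDA with the thrust backend
#else
#include <complex>
typedef std::complex<double> demo_cxtype; // simplified fallback for this sketch only
#endif

// Downstream code manipulates demo_cxtype without caring which backend was selected.
demo_cxtype demo_product( const demo_cxtype& a, const demo_cxtype& b ) { return a * b; }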
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.mad/src/rambo.h b/epochX/cudacpp/gg_ttg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index adda711aad..ee1a51555d 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005533933639526367  +DEBUG: model prefixing takes 0.0054416656494140625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.345 s VVV1 VVV1 FFV1 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.787s -user 0m0.730s -sys 0m0.049s -Code generation completed in 0 seconds +real 0m0.803s +user 0m0.731s +sys 0m0.066s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
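[Editor's sketch, not part of the patch] In the Bridge.h hunk above, every triple-chevron launch becomes a gpuLaunchKernel call; the macro (defined in the GpuAbstraction.h hunk further below) expands back to the chevron syntax, which hipcc accepts as well as nvcc. A sketch of the equivalence, using a hypothetical scale kernel and assuming GpuAbstraction.h is visible:

#include "GpuAbstraction.h"

__global__ void scale( double* data, int n ) // hypothetical demo kernel
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
  if( i < n ) data[i] *= 2.;
}

void launchTwice( double* devData, int n, int blocks, int threads )
{
  scale<<<blocks, threads>>>( devData, n );              // direct CUDA/HIP launch
  gpuLaunchKernel( scale, blocks, threads, devData, n ); // identical launch via the macro
}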
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
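[Editor's sketch, not part of the patch] The GpuAbstraction.h macros above let one translation unit compile unchanged under nvcc and hipcc. A minimal allocate/copy/free round trip through them, assuming GpuRuntime.h (which defines the checkGpu used inside the macros) is included:

#include "GpuRuntime.h" // pulls in GpuAbstraction.h and defines checkGpu
#include <cstddef>

// Copy n doubles to the device and back; every gpu* macro expands to the
// corresponding cuda* or hip* runtime call, wrapped in checkGpu.
void roundTrip( const double* hstIn, double* hstOut, std::size_t n )
{
  double* dev = nullptr;
  gpuMalloc( (void**)&dev, n * sizeof( double ) );
  gpuMemcpy( dev, hstIn, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuMemcpy( hstOut, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );
}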
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
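[Editor's sketch, not part of the patch] The MatrixElementKernels.cc hunk above keeps the usual CUDA error-handling discipline under the new names: a kernel launch returns no status, so the launch error is peeked explicitly and the device is synchronized before results are trusted. A sketch of that pattern, reusing the hypothetical scale kernel from the sketch after the Bridge hunk:

void launchAndCheck( double* devBuf, int n, int blocks, int threads )
{
  gpuLaunchKernel( scale, blocks, threads, devBuf, n );
  checkGpu( gpuPeekAtLastError() );   // catch launch-configuration errors immediately
  checkGpu( gpuDeviceSynchronize() ); // wait for the kernel; surfaces runtime faults
}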
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
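All of these mechanical __CUDACC__ to MGONGPUCPP_GPUIMPL substitutions hinge on a single new guard, presumably defined once in mgOnGpuConfig.h. A plausible sketch (an assumption here, not the literal definition):

// MGONGPUCPP_GPUIMPL means "this translation unit is a GPU build", whether the
// compiler is nvcc (CUDA) or hipcc (HIP); plain C++ builds leave it undefined,
// which is why #ifdef selects mg5amcGpu and #ifndef selects mg5amcCpu throughout.
#if defined __CUDACC__ || defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL 1
#endif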
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
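The neppM remark in the MemoryAccessMomenta.h hunk above is the heart of the AOSOA momenta layout: pages of neppM events are stored contiguously per particle and per four-momentum component. An illustrative index computation (not the literal MemoryAccessMomenta code):

// Buffer layout momenta[npagM][npar][np4][neppM]: for event ievt, particle ipar and
// component ip4, consecutive GPU threads (consecutive ieppM within a page) read
// consecutive fptypes, which is what makes the global-memory loads coalesced.
inline int momentaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // page index
  const int ieppM = ievt % neppM; // event index within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}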
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include <sstream> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template<typename T> class PinnedHostBufferBase : public BufferBase<T> { @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase<T>( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template<typename T> class DeviceBufferBase : public BufferBase<T> { @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase<T>( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template<typename T, size_t sizePerEvent, bool ismisaligned> class HostBuffer
: public HostBufferBase<T, ismisaligned>, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template<typename T, size_t sizePerEvent> class PinnedHostBuffer : public PinnedHostBufferBase<T>, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer<fptype, sizePerEventRndNumMomenta, HostBufferALIGNED> HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer<fptype, sizePerEventOneFp, HostBufferALIGNED> HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventGs, HostBufferALIGNED> HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventNumerators, HostBufferALIGNED> HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventDenominators, HostBufferALIGNED> HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventCouplings, HostBufferALIGNED> HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferALIGNED> HostBufferMomenta; //typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferMISALIGNED> HostBufferMomenta; // TEST MISALIGNMENT!
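The gpuMallocHost/gpuMalloc/gpuFree calls in the buffer classes above are assumed to be checked aliases, so that error checking moves inside the alias and the constructors no longer wrap anything in checkCuda. A sketch under that assumption (hipHostMalloc/hipHostFree are the HIP APIs; HIP deprecates hipMallocHost):

#include <cassert>
// Minimal error check for the sketch (the real GpuRuntime.h presumably keeps the
// richer assertCuda-style reporting of the old CudaRuntime.h); both cudaSuccess
// and hipSuccess are zero.
#define checkGpu( code ) assert( ( code ) == 0 )
#if defined __CUDACC__
  #define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
  #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
  #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
  #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#elif defined __HIPCC__
  #define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) )
  #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
  #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
  #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#endif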
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer<fptype, sizePerEventWeights, HostBufferALIGNED> HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer<fptype, sizePerEventMatrixElements, HostBufferALIGNED> HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase<bool> BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase<bool, HostBufferALIGNED> HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer<fptype, sizePerEventWavefunctions, HostBufferALIGNED> HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer<fptype, sizePerEventRndNumHelicity, HostBufferALIGNED> HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer<fptype, sizePerEventRndNumColor, HostBufferALIGNED> HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer<int, sizePerEventSelectedHelicity, HostBufferALIGNED> HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer<int, sizePerEventSelectedColor, HostBufferALIGNED> HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 2988a13b82..2e02593919 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( 
int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -499,7 +500,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -556,7 +557,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -615,7 +616,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -678,8 +679,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -720,9 +721,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -759,7 +760,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -824,12 +825,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -850,7 +851,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -976,9 +977,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1002,7 +1003,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1022,7 +1023,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1036,9 +1037,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1066,7 +1070,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1276,7 +1280,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
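The #error branch above encodes a build invariant rather than a reachable code path: the makefile (see the RNDGEN logic in cudacpp.mk further down in this diff) always forces -DMGONGPU_HAS_NO_CURAND for hipcc builds, so in sketch form the guard is simply:

// Invariant (sketch): curand is CUDA-only, so a HIP compilation unit that still
// appears to have curand support can only mean a misconfigured build.
#if defined __HIPCC__ && !defined MGONGPU_HAS_NO_CURAND
  #error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__
#endif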
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# (NB: the -x cu option previously hardcoded in the %_cu.o rule above is now supplied via CCBUILDRULEFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation (hipcc takes -fno-fast-math directly, without -Xcompiler) ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
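For readers following the NVCC-to-GPUCC generalisation: the CUBUILDRULEFLAGS and CCBUILDRULEFLAGS variables introduced earlier exist so that a single pair of generic build rules can drive either compiler. A minimal standalone sketch of the pattern, condensed from the hunks above (illustrative only, not part of the patch; recipe lines must start with a TAB):

# Sketch: nvcc must receive host compiler flags via -Xcompiler and needs
# -x cu to compile .cc sources as CUDA; hipcc accepts host flags directly
# and compiles .cc sources natively.
ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
  CUBUILDRULEFLAGS = -fPIC -c
  CCBUILDRULEFLAGS = -fPIC -c
else
  CUBUILDRULEFLAGS = -Xcompiler -fPIC -c
  CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu
endif
$(BUILDDIR)/%.o : %.cu
	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@
$(BUILDDIR)/%_cu.o : %.cc
	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@

This is why the pattern rules themselves can stay compiler-agnostic: all vendor-specific syntax is confined to the two flag variables.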
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
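The fbridge.cc hunk above replaces CudaRuntime with GpuRuntime. GpuRuntime.h itself is not part of this excerpt, but its role can be sketched roughly as follows (a sketch under assumptions: only GpuRuntime::setUp/tearDown and gpuDeviceReset are attested in the hunks; everything else here is illustrative):

// Sketch of a backend-neutral runtime wrapper: the gpu* alias is mapped
// onto the vendor runtime API at preprocessing time.
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuDeviceReset cudaDeviceReset // hypothetical mapping
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuDeviceReset hipDeviceReset // hypothetical mapping
#endif
struct GpuRuntime
{
  static void setUp() { /* e.g. force early context creation on the device */ }
  static void tearDown() { gpuDeviceReset(); } // needed by cuda-memcheck --leak-check full
};

With such a wrapper in place, fbridgecreate_ and fbridgedelete_ read identically in CUDA and HIP builds.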
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct compiler-specific build flags when compiling for CUDA (nvcc) or HIP (hipcc) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
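In the cudacpp_src.mk hunks above, the same findstring test selects the compile flags, and the common (src) library is then linked with $(GPUCC) itself for both backends. A hypothetical helper target (not in the patch) shows how a build could report which toolchain the makefile picked (recipe lines must start with a TAB):

# Hypothetical debug target: report which GPU toolchain the makefile selected.
gpuinfo:
	@echo "GPUCC=$(GPUCC)"
	@echo "GPUFLAGS=$(GPUFLAGS)"
ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
	@echo "HIP build: .cc compiled natively, shared lib linked via hipcc -shared"
else ifneq ($(GPUCC),)
	@echo "CUDA build: .cc compiled with -x cu, shared lib linked via nvcc -shared"
else
	@echo "CPU-only build: no GPU compiler found"
endif

Usage would be something like 'make -f cudacpp_src.mk gpuinfo', assuming GPUCC and GPUFLAGS are already set at that point in the makefile.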
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
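The new MGONGPUCPP_GPUIMPL macro defined in mgOnGpuConfig.h above collapses the old __CUDACC__ tests into a single backend-neutral switch. A toy translation unit (hypothetical, for illustration only) shows the intended usage pattern:

// Toy example: the same source compiles for CUDA (nvcc), HIP (hipcc) or
// plain C++, selecting the namespace and backend label at preprocessing time.
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backend()
  {
#if defined __CUDACC__
    return "CUDA"; // MGONGPUCPP_GPUIMPL is defined (as cuda)
#elif defined __HIPCC__
    return "HIP"; // MGONGPUCPP_GPUIMPL is defined (as hip)
#else
    return "C++"; // MGONGPUCPP_GPUIMPL is undefined
#endif
  }
}

Note that only #ifdef/#ifndef tests are ever applied to the macro; its replacement tokens (cuda, hip) are purely informational.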
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.sa/src/rambo.h b/epochX/cudacpp/gg_ttg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 2c2fae1608..3a2b1ad647 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057299137115478516  +DEBUG: model prefixing takes 0.0053348541259765625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.163 s +1 processes with 123 diagrams generated in 0.156 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.433 s -Wrote files for 222 helas calls in 0.711 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.437 s +Wrote files for 222 helas calls in 0.735 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.336 s +ALOHA: aloha creates 5 routines in 0.441 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.327 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -255,10 +255,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.329s -user 0m3.091s -sys 0m0.226s -Code generation completed in 4 seconds +real 0m3.582s +user 0m3.061s +sys 0m0.243s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -284,7 +284,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -314,7 +314,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef 
MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
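The Bridge hunk above is representative of the whole migration: each CUDA-only construct, such as checkCuda( cudaMemcpy( ... ) ) or the kernel<<<blocks, threads>>>( ... ) launch syntax, is rerouted through a vendor-neutral gpu* macro (defined further down in GpuAbstraction.h). A minimal sketch of the correspondence for the transpose launch above, with hypothetical names standing in for the Bridge members:

  // CUDA-only form (removed):
  //   dev_transposeMomentaF2C<<<gpublocks * thrPerEvt, gputhreads>>>( devF, devC, nevt );
  // Portable form (added): on CUDA this expands to exactly the line above,
  // on HIP to the equivalent triple-chevron launch accepted by hipcc
  gpuLaunchKernel( dev_transposeMomentaF2C, gpublocks * thrPerEvt, gputhreads, devF, devC, nevt );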
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // hipHostMalloc is preferred: hipMallocHost is deprecated +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
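Taken together, these macros let one translation unit target either backend with no vendor #ifdefs at the call site (the error checking comes from the checkGpu helper in GpuRuntime.h, shown next). A minimal end-to-end usage sketch, not part of this diff, with a hypothetical kernel, plain double instead of fptype, and nevt assumed to be a multiple of 256:

  __global__ void scaleByTwo( double* data ) { data[blockDim.x * blockIdx.x + threadIdx.x] *= 2; }

  void scaleOnDevice( double* hstData, const int nevt )
  {
    double* devData = nullptr;
    gpuMalloc( &devData, nevt * sizeof( double ) ); // cudaMalloc or hipMalloc, wrapped in checkGpu
    gpuMemcpy( devData, hstData, nevt * sizeof( double ), gpuMemcpyHostToDevice );
    gpuLaunchKernel( scaleByTwo, nevt / 256, 256, devData ); // scaleByTwo<<<nevt / 256, 256>>>( devData )
    gpuMemcpy( hstData, devData, nevt * sizeof( double ), gpuMemcpyDeviceToHost );
    gpuFree( devData );
  }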
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
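When MGONGPU_NSIGHT_DEBUG is enabled, sigmaKin needs a nonzero dynamic shared-memory size as the third launch parameter, which is why the hunk above introduces the gpuLaunchKernelSharedMem variant. A generic sketch of how a kernel consumes such a launch-time allocation (illustrative only; hypothetical reduction kernel, block size assumed to be a power of two):

  __global__ void blockSum( const double* in, double* out )
  {
    extern __shared__ double buf[]; // sized by the launch parameter, not by the kernel
    const int tid = threadIdx.x;
    buf[tid] = in[blockDim.x * blockIdx.x + tid];
    __syncthreads();
    for( int stride = blockDim.x / 2; stride > 0; stride /= 2 )
    {
      if( tid < stride ) buf[tid] += buf[tid + stride];
      __syncthreads();
    }
    if( tid == 0 ) out[blockIdx.x] = buf[0];
  }

  // Reserve nthreads * sizeof( double ) bytes of dynamic shared memory per block:
  // gpuLaunchKernelSharedMem( blockSum, nblocks, nthreads, nthreads * sizeof( double ), devIn, devOut );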
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
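The same one-line substitution recurs in every MemoryAccess*.h header because they all rely on the dual-namespace trick flagged in their NB comments: the identical source text is compiled once into mg5amcGpu (GPU builds) and once into mg5amcCpu (CPU builds), and the selector is now MGONGPUCPP_GPUIMPL, set for both CUDA and HIP, rather than the NVidia-only __CUDACC__. Schematically (not a literal file in this diff):

  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu // CUDA and HIP builds
  #else
  namespace mg5amcCpu // C++/SIMD builds
  #endif
  {
    // ... same source, compiled twice with backend-specific meanings of fptype_sv etc ...
  }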
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
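The neppM constant discussed in the MemoryAccessMomenta.h hunk above fixes the page width of the momenta AOSOA, momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM (the layout is spelled out in a Bridge.h comment earlier in this diff). A sketch of the indexing that layout implies, using a hypothetical helper with plain double instead of fptype:

  inline double& momentumAt( double* buffer, int ievt, int ipar, int ip4, int npar, int np4, int neppM )
  {
    const int ipagM = ievt / neppM; // page of events
    const int ieppM = ievt % neppM; // event within the page
    return buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
  }

  // With neppM a power of 2 times the fptypes per 32-byte cacheline, consecutive GPU
  // threads (consecutive ieppM) read consecutive addresses, i.e. coalesced access.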
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
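PinnedHostBufferBase and DeviceBufferBase above tie each gpu* allocation to an object lifetime, so stack unwinding (including the exceptions thrown by the copy helpers further down) releases device and pinned memory automatically. A condensed sketch of that RAII pattern, simplified from the real templated classes:

  struct DeviceArray // illustration only, not the real DeviceBufferBase
  {
    double* data = nullptr;
    const size_t bytes;
    DeviceArray( size_t n ) : bytes( n * sizeof( double ) ) { gpuMalloc( &data, bytes ); }
    ~DeviceArray() { gpuFree( data ); }
    DeviceArray( const DeviceArray& ) = delete; // copying would double-free on destruction
    DeviceArray& operator=( const DeviceArray& ) = delete;
  };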
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 19bc1e7973..2f4b1f9d0e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity 
< nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2417,7 +2418,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2475,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2534,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,8 +2629,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,9 +2672,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2711,7 +2712,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2776,12 +2777,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2803,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2928,9 +2929,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -2954,7 +2955,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2974,7 +2975,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2988,9 +2989,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3018,7 +3022,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3228,7 +3232,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 04f7c62976..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ?
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include <sstream> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
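Before the cudacpp.mk changes, an aside on the gpuLaunchKernel calls introduced in RamboSamplingKernels.cc above: they replace the CUDA-only triple-chevron launch syntax, and the actual GpuAbstraction.h implementation is not shown in this section. The following is a minimal hypothetical sketch, assuming a plain variadic macro.
// Hypothetical sketch (not part of this patch): one launch macro that
// compiles with both nvcc (chevron syntax) and hipcc (hipLaunchKernelGGL).
#if defined __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif
With a wrapper of this shape, the two launches above become single calls that are identical on both platforms, which is exactly what the + lines in the hunk show.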
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>" + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP?
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>" -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
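An aside on the GpuRuntime::setUp and GpuRuntime::tearDown calls in fbridge.cc above: GpuRuntime.h only appears as a symlink in this patch, so the following is a hypothetical sketch of the shape such a CudaRuntime-like helper presumably has. gpuSetDevice is an assumed vendor-neutral alias (it does not appear in this diff); checkGpu is sketched after the runTest.cc hunk below.
// Hypothetical sketch (not part of this patch) of a CudaRuntime-like helper.
struct GpuRuntime
{
  GpuRuntime( bool debug = false ) { setUp( debug ); }
  ~GpuRuntime() { tearDown(); }
  static void setUp( bool /*debug*/ = false ) { checkGpu( gpuSetDevice( 0 ) ); }    // select device 0 up front
  static void tearDown( bool /*debug*/ = false ) { checkGpu( gpuDeviceReset() ); } // reset the device (for leak checkers)
};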
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
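An aside on the checkGpu( gpuDeviceReset() ) call in runTest.cc above: the error-checking helper itself is not shown in this section, so here is a hypothetical sketch of the usual pattern, with gpuError_t, gpuSuccess and gpuGetErrorString assumed as vendor-neutral aliases of their cuda*/hip* counterparts.
// Hypothetical sketch (not part of this patch): fail loudly if a GPU runtime call fails.
#include <cstdio>
#include <cstdlib>
#define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
inline void assertGpu( gpuError_t code, const char* file, int line )
{
  if( code != gpuSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
    exit( code );
  }
}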
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the compiler-specific GPUFLAGS for nvcc (CUDA) or hipcc (HIP) builds +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
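
A practical note on the mgOnGpuConfig.h hunks above, before the mgOnGpuCxtypes.h body continues below: hipcc builds force-define MGONGPU_HAS_NO_CURAND, so HIP falls back to the common (host-side) random numbers, while CUDA builds may still use curand unless the macro is set externally. A minimal sketch of the guard that client code can rely on (illustrative only, not part of the patch):

  #include "mgOnGpuConfig.h"
  #ifdef MGONGPU_HAS_NO_CURAND
  // HIP build, or a CUDA install without curand (see PR #784 and #785):
  // use CommonRandomNumberKernel
  #else
  // CUDA build with curand available: CurandRandomNumberKernel is an option
  #endif
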
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return 
mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 3c3686e228..1b6c420503 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005596637725830078  +DEBUG: model prefixing takes 0.005376100540161133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.166 s +1 processes with 123 diagrams generated in 0.156 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.442 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.427 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s +ALOHA: aloha creates 5 routines in 0.319 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.506s -user 0m1.438s -sys 0m0.059s -Code generation completed in 2 seconds +real 0m1.461s +user 0m1.381s +sys 0m0.050s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
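
The Bridge.h hunks that follow show the translation rule applied mechanically across this patch: explicit checkCuda wrappers and triple-chevron launches become gpu* macro calls. In sketch form (devBuf, hstBuf, nbytes, someKernel, gpublocks and gputhreads are placeholder names; the macros themselves are defined in the new GpuAbstraction.h later in this diff):

  // Before (nvcc only):
  checkCuda( cudaMemcpy( devBuf, hstBuf, nbytes, cudaMemcpyHostToDevice ) );
  someKernel<<<gpublocks, gputhreads>>>( devBuf );
  // After (CUDA or HIP):
  gpuMemcpy( devBuf, hstBuf, nbytes, gpuMemcpyHostToDevice );
  gpuLaunchKernel( someKernel, gpublocks, gputhreads, devBuf );

Note that the explicit error check disappears from the call site because gpuMemcpy already wraps the runtime call in checkGpu.
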
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
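
GpuAbstraction.h, whose body follows, is the heart of the port: a single gpu* vocabulary that expands to cuda* calls under nvcc and hip* calls under hipcc. An illustrative round trip using only the macros defined just below (devBuf, hstBuf, nbytes, someKernel, gpublocks and gputhreads are placeholders, not names from the patch):

  fptype* devBuf = nullptr;
  gpuMalloc( (void**)&devBuf, nbytes );                         // cudaMalloc or hipMalloc, wrapped in checkGpu
  gpuMemcpy( devBuf, hstBuf, nbytes, gpuMemcpyHostToDevice );   // checked host-to-device copy
  gpuLaunchKernel( someKernel, gpublocks, gputhreads, devBuf ); // kernel<<<blocks, threads>>>( ... )
  checkGpu( gpuDeviceSynchronize() );
  gpuFree( devBuf );
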
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
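
GpuRuntime.h, reproduced next, keeps the RAII design of the deleted CudaRuntime.h but rebuilds it on the gpu* macros. A sketch of the intended use, mirroring how the old CudaRuntime was instantiated at the top of an application's main (the event-loop comment is a placeholder):

  #include "GpuRuntime.h"
  int main()
  {
  #ifdef MGONGPUCPP_GPUIMPL
    mg5amcGpu::GpuRuntime gpuRuntime; // gpuSetDevice(0) now, gpuDeviceReset() on destruction
  #endif
    // ... run the event loop ...
    return 0;
  }
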
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
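
Before the MatrixElementKernels.h body continues below, a note on the one launch above that passes dynamic shared memory. Under nvcc the macro call expands back to the original triple-chevron form, and hipcc expands identically since HIP accepts the same launch syntax; args... stands in for the momenta, couplings and matrix-element buffers listed above:

  gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, args... );
  // expands to:
  sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( args... );
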
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
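The same three-line change recurs in every MemoryAccess header above and below: the enclosing namespace is chosen by the preprocessor, so one source tree yields mg5amcGpu types in GPU builds (CUDA or HIP, via MGONGPUCPP_GPUIMPL) and mg5amcCpu types in C++/SIMD builds, avoiding the multiply-defined-symbol problems referenced in #318 and #725. Schematically, with a hypothetical type standing in for the real accessors:

// Hypothetical illustration of the recurring namespace-selection pattern.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU builds (CUDA or HIP)
#else
namespace mg5amcCpu // CPU builds (scalar or SIMD)
#endif
{
  // ExampleAccess is a stand-in for the MemoryAccess* classes: the same
  // declaration compiles into two distinct namespaces, so CPU and GPU
  // object files can be linked into one executable without clashes.
  struct ExampleAccess
  {
  };
}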
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
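On the neppM remark above (events per page in the momenta AOSOA buffer): the point of tying neppM to the cacheline size on GPUs is that consecutive threads then read consecutive fptype values. An indexing sketch makes this concrete; the [page][particle][component][event] ordering and the names below are an assumption for illustration, not the actual accessor in MemoryAccessMomenta.h.

#include <cstddef>
// Hedged AOSOA indexing sketch: buffer viewed as [ipagM][ipar][ip4][ieppM].
// Thread ievt and thread ievt+1 differ only in ieppM (within a page), i.e.
// they touch adjacent fptype's, which is what coalesces global-memory loads.
inline std::size_t momentumIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                                  std::size_t npar, std::size_t np4, std::size_t neppM )
{
  const std::size_t ipagM = ievt / neppM; // page holding this event
  const std::size_t ieppM = ievt % neppM; // event slot within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}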
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
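Note that the explicit checkCuda wrappers disappear from the PinnedHostBufferBase and DeviceBufferBase constructors above: the buffers now call gpuMallocHost/gpuMalloc/gpuFreeHost/gpuFree bare, which suggests the error check is folded into the gpu* helpers themselves. A sketch of helpers with that behaviour (assumed definitions, not the committed GpuAbstraction.h):

// Hedged sketch: allocation helpers that embed the checkGpu error check,
// so call sites need no explicit wrapper (assumed, for illustration).
#ifdef __CUDACC__
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )         // device memory
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#elif defined __HIPCC__
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) )  // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )          // device memory
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#endif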
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index f9016eaa88..d59cc349e3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu 
#endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2474,7 +2475,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2531,7 +2532,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2590,7 +2591,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2685,8 +2686,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2728,9 +2729,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2768,7 +2769,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2833,12 +2834,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2859,7 +2860,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2985,9 +2986,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -3011,7 +3012,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -3031,7 +3032,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -3045,9 +3046,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3075,7 +3079,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3285,7 +3289,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 04f7c62976..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP?
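The open question in the comment above (a HIP equivalent of NVTX) has a natural candidate in the roctx markers shipped with ROCm's roctracer. The sketch below is purely illustrative and not part of this PR: USE_ROCTX and the mgRange* names are invented here, while nvtxRangePushA/nvtxRangePop and roctxRangePushA/roctxRangePop are the real NVTX and roctx entry points.

  // Hypothetical sketch only: profiler range markers under CUDA (NVTX) and a
  // possible HIP analogue (roctx from roctracer); mgRangePush/mgRangePop and
  // USE_ROCTX are invented names for illustration.
  #if defined __CUDACC__ && defined USE_NVTX
  #include "nvtx3/nvToolsExt.h"
  #define mgRangePush( name ) nvtxRangePushA( name )
  #define mgRangePop() nvtxRangePop()
  #elif defined __HIPCC__ && defined USE_ROCTX
  #include <roctracer/roctx.h>
  #define mgRangePush( name ) roctxRangePushA( name )
  #define mgRangePop() roctxRangePop()
  #else
  #define mgRangePush( name )
  #define mgRangePop()
  #endif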
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make hipcc visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# (NB: in the CUDA case, CCBUILDRULEFLAGS includes '-x cu' so that the .cc file in the rule above is compiled as CUDA) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation (hipcc takes -fno-fast-math directly, without -Xcompiler) ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
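The cxx_objects_lib/cu_objects_lib split above relies on every source file guarding its namespace (mg5amcGpu vs mg5amcCpu) on MGONGPUCPP_GPUIMPL, so that one .cc file can be compiled twice, once as plain C++ and once as CUDA/HIP via the %_cu.o rule. A self-contained sketch of that pattern, with hypothetical file and function names:

  // sketch.cc (hypothetical): built twice by cudacpp.mk-style rules, e.g.
  //   $(CXX)   -fPIC -c sketch.cc -o sketch.o          -> namespace mg5amcCpu
  //   $(GPUCC) -fPIC -c -x cu sketch.cc -o sketch_cu.o -> namespace mg5amcGpu
  #if defined __CUDACC__ || defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL
  #endif
  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu
  #endif
  {
  #ifdef MGONGPUCPP_GPUIMPL
    // GPU build: one event per device thread
    __global__ void scaleWeights( double* w, double factor )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      w[ievt] *= factor;
    }
  #else
    // C++ build: one event per loop iteration
    void scaleWeights( double* w, double factor, int nevt )
    {
      for( int ievt = 0; ievt < nevt; ievt++ ) w[ievt] *= factor;
    }
  #endif
  }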
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
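The fbridgecreate_/fbridgedelete_ hunks above follow a simple ownership pattern: Fortran holds only an opaque CppObjectInFortran handle, and C++ allocates and frees the object behind it (a Bridge<FORTRANFPTYPE> in the real code, together with the GpuRuntime::setUp/tearDown calls on GPU builds). A compact standalone illustration with hypothetical names:

  // Hypothetical standalone version of the Fortran bridge ownership pattern.
  #include <stdexcept>
  struct CppObjectInFortran
  {
    virtual ~CppObjectInFortran() {}
  };
  struct Counters : public CppObjectInFortran // stand-in for Bridge<FORTRANFPTYPE>
  {
    explicit Counters( int nevt ) : m_nevt( nevt ) {}
    int m_nevt;
  };
  extern "C"
  {
    // Fortran passes a pointer to its opaque handle; C++ fills or frees it
    void counterscreate_( CppObjectInFortran** ppobj, const int* pnevt )
    {
      *ppobj = new Counters( *pnevt );
    }
    void countersdelete_( CppObjectInFortran** ppobj )
    {
      Counters* pobj = dynamic_cast<Counters*>( *ppobj );
      if( pobj == 0 ) throw std::runtime_error( "countersdelete_: invalid Counters address" );
      delete pobj;
    }
  }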
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct build rule flags (-fPIC -c, plus -x cu for nvcc) depending on whether GPUCC is nvcc or hipcc +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
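The empty __global__/__host__/__device__ definitions above are what allow one set of headers to be parsed by a plain C++ compiler as well as by nvcc or hipcc. A tiny self-contained illustration (hypothetical example; the real definitions come from mgOnGpuConfig.h):

  // With empty fallbacks for the CUDA/HIP specifiers, this compiles unchanged
  // with g++ (specifiers vanish) and with nvcc/hipcc (host+device function).
  #if !defined __CUDACC__ && !defined __HIPCC__
  #define __host__
  #define __device__
  #endif
  __host__ __device__ inline double
  fpsquare( double x ) // hypothetical example function
  {
    return x * x;
  }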
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 2480a22f8d..f222e5a6b5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005784511566162109  +DEBUG: model prefixing takes 0.005517005920410156  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.929 s +1 processes with 1240 diagrams generated in 1.861 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,15 +192,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.718 s -Wrote files for 2281 helas calls in 18.893 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.528 s +Wrote files for 2281 helas calls in 18.450 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.322 s +ALOHA: aloha creates 5 routines in 0.314 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -208,7 +208,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.319 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -257,9 +257,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m29.815s -user 0m29.332s -sys 0m0.380s +real 0m29.049s +user 0m28.554s +sys 0m0.393s Code generation completed in 30 seconds ************************************************************ * * @@ -286,7 +286,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -316,7 +316,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % 
s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
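A note on the Bridge::gpu_sequence hunks above: every raw CUDA runtime call is replaced by its gpu*-prefixed counterpart from the new GpuAbstraction.h header (introduced later in this diff). A minimal before/after sketch of the momenta copy and the AOSOA transpose launch, with the original launch configuration reconstructed from the gpuLaunchKernel arguments:

    // CUDA-only form removed by this PR:
    checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
    dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );

    // Portable form added by this PR (expands to the same code under nvcc, and to the HIP equivalent under hipcc):
    gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
    gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );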
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
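A usage sketch of the GpuAbstraction.h macro layer just defined (the kernel and helper names here are hypothetical and not part of the PR; checkGpu comes from the GpuRuntime.h header that follows):

    #include "GpuAbstraction.h"
    #include "GpuRuntime.h" // for checkGpu

    __global__ void scaleByTwo( double* data, int n )
    {
      const int i = blockDim.x * blockIdx.x + threadIdx.x;
      if( i < n ) data[i] *= 2.;
    }

    void runScale( int nevt ) // hypothetical helper; assumes nevt is a multiple of 256
    {
      double* d = nullptr;
      gpuMalloc( &d, nevt * sizeof( double ) );                // cudaMalloc or hipMalloc, already wrapped in checkGpu
      gpuLaunchKernel( scaleByTwo, nevt / 256, 256, d, nevt ); // expands to scaleByTwo<<<nevt / 256, 256>>>( d, nevt )
      checkGpu( gpuPeekAtLastError() );
      checkGpu( gpuDeviceSynchronize() );
      gpuFree( d );                                            // cudaFree or hipFree, already wrapped in checkGpu
    }

The same translation unit compiles unchanged with nvcc (which defines __CUDACC__) or hipcc (which defines __HIPCC__); only the macro branch taken differs.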
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include <cassert> + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
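The MatrixElementKernelDevice hunks above route every launch through the two variadic macros, so the kernel sources keep a single launch syntax for both backends. A self-contained sketch of a kernel that actually consumes the dynamic shared-memory size passed as the fourth argument of gpuLaunchKernelSharedMem (names hypothetical; assumes nthreads is a power of two):

    __global__ void blockSum( const double* in, double* out, int n )
    {
      extern __shared__ double buf[]; // sized by the shared-memory launch argument
      const int i = blockDim.x * blockIdx.x + threadIdx.x;
      buf[threadIdx.x] = ( i < n ? in[i] : 0. );
      __syncthreads();
      for( int s = blockDim.x / 2; s > 0; s >>= 1 ) // tree reduction within the block
      {
        if( threadIdx.x < s ) buf[threadIdx.x] += buf[threadIdx.x + s];
        __syncthreads();
      }
      if( threadIdx.x == 0 ) out[blockIdx.x] = buf[0];
    }

    // one partial sum per block; one double of shared memory per thread
    gpuLaunchKernelSharedMem( blockSum, nblocks, nthreads, nthreads * sizeof( double ), dIn, dOut, n );

Under nvcc this expands to blockSum<<<nblocks, nthreads, nthreads * sizeof( double )>>>( dIn, dOut, n ), and under hipcc to the identical triple-chevron HIP launch.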
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
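The neppM remark above refers to the AOSOA layout momenta[npagM][npar][np4][neppM] (with nevt = npagM * neppM) that these MemoryAccess classes decode, as also described in the Bridge.h hunk earlier in this diff. A sketch of the flat index computation (helper name hypothetical; npar = 7 and np4 = 4 for g g > t t~ g g g):

    // Flat offset of component ip4 of particle ipar in event ievt
    inline size_t indexMomenta( size_t ievt, size_t ipar, size_t ip4, size_t neppM, size_t npar = 7, size_t np4 = 4 )
    {
      const size_t ipagM = ievt / neppM; // page holding this event
      const size_t ieppM = ievt % neppM; // position of this event within its page
      return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
    }

With this ordering the neppM events of a page store each (ipar, ip4) component contiguously, which is what yields coalesced global-memory loads on GPUs and full SIMD vectors on CPUs.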
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
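These RAII buffer templates pair with the copyDeviceFromHost/copyHostFromDevice helpers defined at the end of this file; the pattern, as used in MatrixElementKernelDevice::computeGoodHelicities earlier in this diff, is:

    PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); // pinned host memory via gpuMallocHost
    DeviceBufferHelicityMask devIsGoodHel( ncomb );     // device memory via gpuMalloc
    // ... sigmaKin_getGoodHel fills devIsGoodHel on the device ...
    copyHostFromDevice( hstIsGoodHel, devIsGoodHel );   // gpuMemcpy with gpuMemcpyDeviceToHost
    // both buffers release themselves (gpuFreeHost / gpuFree) when they go out of scope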
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 19e6cd201c..a478ecb28e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 
0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -30018,7 +30019,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -30075,7 +30076,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -30134,7 +30135,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -30293,8 +30294,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -30337,9 +30338,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -30378,7 +30379,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -30443,12 +30444,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -30469,7 +30470,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -30595,9 +30596,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -30621,7 +30622,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -30641,7 +30642,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -30655,9 +30656,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -30685,7 +30689,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) 
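// [Sketch, not part of this patch: GpuAbstraction.h is assumed to provide thin
// aliases that map the gpu* calls used in the hunks above onto the CUDA or HIP
// runtime, roughly along these lines.]
#ifdef __CUDACC__
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( sym, src, bytes ) )
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif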
-#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -30895,7 +30899,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 2565923dde..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginning of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
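  # Example usage (illustrative, not part of this patch), following the FIXME
  # above: on a node where both nvcc and hipcc are installed, a HIP build can be
  # forced by pointing CUDA_HOME at an invalid path, e.g.
  #   CUDA_HOME=disabled make -f cudacpp.mk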
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision, see line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
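// [Rough sketch (an assumption; GpuRuntime.h itself is not shown in this
// patch) of the interface implied by the fbridge.cc hunk above and by the
// check_sa.cc changes: construction selects device 0, destruction books the
// device reset needed by cuda-memcheck.]
struct GpuRuntime final
{
  GpuRuntime( const bool debug = false ) { setUp( debug ); }
  ~GpuRuntime() { tearDown(); }
  static void setUp( const bool /*debug*/ = false ) { checkGpu( gpuSetDevice( 0 ) ); } // cudaSetDevice or hipSetDevice (assumed alias)
  static void tearDown() { checkGpu( gpuDeviceReset() ); } // cudaDeviceReset or hipDeviceReset (assumed alias)
};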
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
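//--------------------------------------------------------------------------
// ** NB: worked example of the AOSOA[npagM][npar=4][np4=4][neppM] layout noted
// ** above for hstMomenta (the neppM value is illustrative): with neppM=4, the
// ** ip4-th component of particle ipar in event ievt sits at flat index
//   ( ( ievt / neppM ) * npar * np4 + ipar * np4 + ip4 ) * neppM + ievt % neppM
// ** i.e. blocks of neppM events are interleaved per "page", which gives
// ** coalesced accesses on the GPU and aligned SIMD loads on the CPU.
//--------------------------------------------------------------------------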
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
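# ** NB: the GPUCC variable used in the rules below generalizes the old NVCC
# ** variable so that one set of build rules can drive either nvcc or hipcc;
# ** GPUCC itself is exported by the calling makefile (cudacpp.mk, not part of
# ** this excerpt), along the lines of this illustrative sketch:
#   GPUCC ?= $(shell which nvcc 2>/dev/null)
#   ifeq ($(GPUCC),)
#     GPUCC := $(shell which hipcc 2>/dev/null)
#   endif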
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the GPU (CUDA or HIP) compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the compiler-specific GPUFLAGS needed by nvcc (CUDA) or hipcc (HIP) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
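//--------------------------------------------------------------------------
// ** NB: minimal sketch (function name hypothetical) of what the empty
// ** __global__/__host__/__device__ defines above buy: a single source line
//   __host__ __device__ inline fptype fpsquare( const fptype f ) { return f * f; }
// ** compiles unchanged as plain C++ when MGONGPUCPP_GPUIMPL is undefined, and
// ** as host+device code under nvcc (__CUDACC__) or hipcc (__HIPCC__).
//--------------------------------------------------------------------------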
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
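//--------------------------------------------------------------------------
// ** NB: sketch of the scalar-or-vector ("_sv") convention in the typedefs
// ** just below (variable names illustrative): on GPU each thread handles one
// ** event, so fptype_sv is a plain fptype; in SIMD C++ it is a vector of
// ** neppV values, yet the same source line serves both cases, e.g.
//   bool_sv mask = ( p0_sv > 0 ); // 1 comparison on GPU, neppV comparisons on CPU
//--------------------------------------------------------------------------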
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 0970bf8b4c..2720870321 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005753755569458008  +DEBUG: model prefixing takes 0.005664825439453125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.912 s +1 processes with 1240 diagrams generated in 1.872 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.716 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.609 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.352 s +ALOHA: aloha creates 5 routines in 0.345 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m13.290s -user 0m13.123s -sys 0m0.115s +real 0m12.978s +user 0m12.813s +sys 0m0.111s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
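//--------------------------------------------------------------------------
// ** NB: worked example of the default GPU grid chosen in the Bridge
// ** constructor above (the nevt value is illustrative): nevt=8192 gives
// ** m_gputhreads=256 and m_gpublocks=8192/256=32, so that
// ** m_nevt == m_gpublocks * m_gputhreads and each GPU thread handles one event.
//--------------------------------------------------------------------------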
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
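//--------------------------------------------------------------------------
// ** NB: illustrative expansion of the launch macro used in gpu_sequence above
// ** (per the GpuAbstraction.h introduced later in this patch; argument list
// ** abbreviated):
//   gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, ... )
// ** becomes on both backends (hipcc also accepts the triple-chevron syntax)
//   dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( ... )
// ** so no separate hipLaunchKernelGGL code path is needed.
//--------------------------------------------------------------------------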
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: hipHostMalloc, since hipMallocHost is deprecated +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
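//--------------------------------------------------------------------------
// ** NB: minimal usage sketch of the GpuAbstraction.h macros above (devBuf,
// ** hstBuf, nbytes and dummyKernel are hypothetical); the same lines compile
// ** against the CUDA or the HIP runtime with no backend #ifdef:
//   fptype* devBuf = nullptr;
//   gpuMalloc( (void**)&devBuf, nbytes );                       // cudaMalloc or hipMalloc, checkGpu-wrapped
//   gpuMemcpy( devBuf, hstBuf, nbytes, gpuMemcpyHostToDevice ); // cudaMemcpy or hipMemcpy, checkGpu-wrapped
//   gpuLaunchKernel( dummyKernel, gpublocks, gputhreads, devBuf );
//   checkGpu( gpuPeekAtLastError() );
//   checkGpu( gpuDeviceSynchronize() );
//   gpuFree( devBuf );
//--------------------------------------------------------------------------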
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
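For context on the pattern above: gpuLaunchKernel and gpuLaunchKernelSharedMem hide the backend-specific launch syntax, so the same call site compiles under both nvcc and hipcc. A minimal sketch of how such wrappers can be written (an illustrative assumption, not the verbatim contents of GpuAbstraction.h):

// Sketch only: map the generic launch wrappers onto CUDA's <<<...>>> syntax
// and HIP's hipLaunchKernelGGL; the real GpuAbstraction.h may differ in detail.
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), ( sharedMem ), 0, __VA_ARGS__ )
#endif

Either way the kernel arguments are forwarded unchanged, which is why each launch in the diff above is a pure one-line rewrite.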
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
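The KernelAccessHelper hunk above is the crux of the single-source design: in a GPU build each thread derives its own event index from the grid, while in a C++ build the event index is an explicit argument (the caller loops over events, possibly in SIMD pages). A stripped-down sketch of the same idea, with hypothetical names:

// Illustration only: eventRecord is a hypothetical helper, fptype assumed double.
using fptype = double;
#ifdef MGONGPUCPP_GPUIMPL
__device__ inline fptype& eventRecord( fptype* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread per event
  return buffer[ievt];
}
#else
inline fptype& eventRecord( fptype* buffer, const int ievt ) // CPU: caller passes ievt
{
  return buffer[ievt];
}
#endif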
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
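The neppM comment above (a power of 2 times the number of fptype's in a 32-byte cacheline) is what makes GPU reads of the momenta coalesced. As a worked illustration of the AOSOA layout it describes, the flat index of momentum component ip4 of particle ipar for event ievt can be computed as follows (hypothetical helper, not from the repo):

#include <cstddef>
// Illustration only: AOSOA[npagM][npar][np4][neppM] indexing, neppM events per page.
inline std::size_t aosoaIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                               std::size_t npar, std::size_t np4, std::size_t neppM )
{
  const std::size_t ipagM = ievt / neppM; // memory page holding this event
  const std::size_t ieppM = ievt % neppM; // position of the event within its page
  // for fixed (ipar, ip4), consecutive events sit at consecutive addresses,
  // so adjacent GPU threads issue coalesced loads
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}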
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include <sstream> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
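In the PinnedHostBufferBase and DeviceBufferBase hunks above, the explicit checkCuda( cudaMallocHost(...) ) calls become bare gpuMallocHost(...) calls, which suggests the error check has moved into the wrapper itself. One plausible definition of these wrappers (an assumption; the real GpuAbstraction.h may differ in detail):

// Sketch only: allocation wrappers with the error check folded in.
#ifdef __CUDACC__
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#elif defined __HIPCC__
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#endif

Folding checkGpu into the macro keeps every call site identical on both backends and ensures no allocation error is silently dropped.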
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index c2f8607428..fa23301c50 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -31908,7 +31909,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -31965,7 +31966,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -32024,7 +32025,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -32183,8 +32184,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -32227,9 +32228,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -32268,7 +32269,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -32333,12 +32334,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -32359,7 +32360,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -32485,9 +32486,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -32511,7 +32512,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -32531,7 +32532,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -32545,9 +32546,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -32575,7 +32579,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) 
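The gpuMemcpyToSymbol rewrites above (and the gpuMemcpy calls in copyDeviceFromHost/copyHostFromDevice earlier) fit the same wrapper scheme; a sketch under the same assumption, noting that HIP needs the HIP_SYMBOL macro to reference a device symbol:

// Sketch only: memcpy wrappers for the two runtimes.
#ifdef __CUDACC__
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, bytes ) )
#endif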
-#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -32785,7 +32789,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 2565923dde..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 
0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
 */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
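The fbridge.cc hunks above only swap CudaRuntime for GpuRuntime; the Fortran-facing interface itself is unchanged. That interface follows the classic opaque-handle pattern: Fortran stores a pointer to an abstract C++ base class and passes it back through extern "C" entry points, which recover the concrete type with dynamic_cast. A self-contained sketch of the pattern (type and function names here are illustrative stand-ins, not the plugin's real ones):

    // Hypothetical sketch of the opaque-handle pattern used by the Fortran bridge.
    #include <stdexcept>

    struct CppObjectInFortran // abstract base: all Fortran ever sees is a pointer to this
    {
      virtual ~CppObjectInFortran() {}
    };

    struct DemoBridge : public CppObjectInFortran // stand-in for the real Bridge<FORTRANFPTYPE>
    {
      void sequence() { /* compute matrix elements for one batch of events */ }
    };

    extern "C" // unmangled symbols, callable from Fortran as demobridgecreate_ etc
    {
      void demobridgecreate_( CppObjectInFortran** ppbridge ) { *ppbridge = new DemoBridge; }
      void demobridgesequence_( CppObjectInFortran** ppbridge )
      {
        DemoBridge* pbridge = dynamic_cast<DemoBridge*>( *ppbridge );
        if( pbridge == 0 ) throw std::runtime_error( "demobridgesequence_: invalid Bridge address" );
        pbridge->sequence();
      }
      void demobridgedelete_( CppObjectInFortran** ppbridge ) { delete *ppbridge; } // virtual dtor makes this safe
    }

The dynamic_cast guard is what turns a stale or mistyped Fortran handle into a clean runtime_error instead of undefined behaviour.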
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct build flags for nvcc (which needs '-x cu' to compile .cc files as CUDA) or hipcc +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
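The Parameters_sm.h hunks above bracket auto-generated code with warning-suppression pragmas for both the host compiler and nvcc. As a compact illustration of that double bracket (a sketch under the stated macros, not a literal excerpt of the generated file):

    // Sketch of the two-compiler pragma bracket around generated code.
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-variable"
    #pragma GCC diagnostic ignored "-Wunused-parameter"
    #ifdef __CUDACC__
    #pragma nv_diagnostic push
    #pragma nv_diag_suppress 177 // nvcc diagnostic 177: "variable was declared but never referenced"
    #endif
    inline double generatedHelper( double x, double unusedParam ) { return x; } // generated code would go here
    #ifdef __CUDACC__
    #pragma nv_diagnostic pop
    #endif
    #pragma GCC diagnostic pop

Both push/pop pairs are needed because nvcc forwards host code to the C++ compiler but reports some warnings itself under its own diagnostic numbering.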
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif
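The new block at the top of mgOnGpuConfig.h folds the two compiler checks into a single MGONGPUCPP_GPUIMPL switch, which the rest of this patch then tests instead of __CUDACC__. Schematically, downstream code selects its namespace and backend from that one macro (an illustrative usage, not a literal excerpt from the plugin):

    #include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL under nvcc/hipcc, undefines it for plain C++

    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu // CUDA or HIP build
    #else
    namespace mg5amcCpu // plain C++ (possibly SIMD-vectorized) build
    #endif
    {
      inline const char* backend()
      {
    #if defined __CUDACC__
        return "cuda";
    #elif defined __HIPCC__
        return "hip";
    #else
        return "cpp";
    #endif
      }
    }

Keeping the CUDA-or-HIP question in one macro is what allows the hundreds of mechanical __CUDACC__ to MGONGPUCPP_GPUIMPL substitutions elsewhere in this patch, while genuinely CUDA-only features (curand, thrust, NVTX, nsight) still test __CUDACC__ directly.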
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
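The "empty declaration specifiers" hunk above is what lets GPU-flavoured source compile as plain C++: when MGONGPUCPP_GPUIMPL is undefined, __global__, __host__ and __device__ expand to nothing. For example (assuming mgOnGpuConfig.h has been included and this is a plain C++ build):

    #include "mgOnGpuConfig.h"

    // In a C++ build the specifiers vanish and these are ordinary host functions;
    // under nvcc or hipcc the very same source declares device and kernel code.
    __host__ __device__ inline double square( const double x ) { return x * x; }
    __global__ void squareAll( double* v, const int n )
    {
      for( int i = 0; i < n; i++ ) v[i] = square( v[i] ); // trivially serial in the C++ build
    }

This single-source trick is also why the makefiles compile the same .cc files twice, once as C++ objects and once (with '-x cu') as GPU objects.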
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return 
mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2c0e77fafd..bb803498ee 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005677223205566406  +DEBUG: model prefixing takes 0.005455732345581055  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. 
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -230,17 +230,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s -Wrote files for 32 helas calls in 0.231 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s +Wrote files for 32 helas calls in 0.216 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.364 s +ALOHA: aloha creates 2 routines in 0.143 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.137 s +ALOHA: aloha creates 4 routines in 0.130 s FFV1 FFV1 FFV1 @@ -294,10 +294,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.934s -user 0m1.748s -sys 0m0.220s -Code generation completed in 3 seconds +real 0m1.916s +user 0m1.672s +sys 0m0.240s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -323,7 +323,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -353,7 +353,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
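The hunks above show the recurring pattern behind this whole change: each translation unit opens either namespace mg5amcGpu or namespace mg5amcCpu depending on a single preprocessor switch, and MGONGPUCPP_GPUIMPL now plays the role that __CUDACC__ played before, so that HIP builds also take the GPU branch. The macro's actual definition is not part of this diff; the sketch below is a hypothetical, self-contained illustration that assumes it is set whenever a CUDA or HIP device compiler is active.

#include <iostream>

// Hypothetical stand-in for the plugin's real configuration header (assumption, not repository code).
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // GPU build (CUDA or HIP)
#endif

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU implementation: same names, device-oriented types
#else
namespace mg5amcCpu // CPU implementation: same names, SIMD-oriented types
#endif
{
  inline const char* backend()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "GPU (CUDA or HIP)";
#else
    return "CPU";
#endif
  }
}

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  std::cout << mg5amcGpu::backend() << std::endl; // nvcc or hipcc build
#else
  std::cout << mg5amcCpu::backend() << std::endl; // plain host compiler build
#endif
  return 0;
}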
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
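GpuAbstraction.h above is the centrepiece of the port: every gpu* name maps onto the corresponding CUDA or HIP runtime call, so call sites stay backend-neutral. The following is a hypothetical usage sketch, not repository code; it assumes a device build (MGONGPUCPP_GPUIMPL defined, so that checkGpu and GpuRuntime from the GpuRuntime.h listing that resumes below exist), and the kernel and sizes are illustrative.

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

// Illustrative kernel (not repository code): double every element.
__global__ void scale2( double* d, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) d[i] *= 2.;
}

int main()
{
  mg5amcGpu::GpuRuntime runtime; // RAII: gpuSetDevice(0) now, gpuDeviceReset() on exit
  const int n = 1024;
  double *hst = nullptr, *dev = nullptr;
  gpuMallocHost( (void**)&hst, n * sizeof( double ) ); // cudaMallocHost or hipHostMalloc
  gpuMalloc( (void**)&dev, n * sizeof( double ) );     // cudaMalloc or hipMalloc
  for( int i = 0; i < n; i++ ) hst[i] = i;
  gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scale2, n / 256, 256, dev, n ); // expands to scale2<<<n / 256, 256>>>( dev, n )
  checkGpu( gpuPeekAtLastError() );
  gpuMemcpy( hst, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );
  gpuFreeHost( hst );
  return 0;
}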
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
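In the MatrixElementKernels.cc hunks above, every triple-chevron launch becomes a gpuLaunchKernel or gpuLaunchKernelSharedMem call; the second form forwards the dynamic shared-memory size (ntpbMAX * sizeof(float)) that sigmaKin needs in MGONGPU_NSIGHT_DEBUG builds. A hypothetical kernel showing the same shared-memory pattern (illustrative names, not repository code):

#include "GpuAbstraction.h"

// Illustrative block-sum kernel using dynamic shared memory (not repository code).
// The caller must size 'in' as gridDim.x * blockDim.x elements and 'out' as gridDim.x.
__global__ void blockSum( const float* in, float* out )
{
  extern __shared__ float buf[]; // sized at launch time via the sharedMem argument
  buf[threadIdx.x] = in[blockDim.x * blockIdx.x + threadIdx.x];
  __syncthreads();
  if( threadIdx.x == 0 )
  {
    float s = 0;
    for( unsigned int t = 0; t < blockDim.x; t++ ) s += buf[t];
    out[blockIdx.x] = s;
  }
}

// Call site, mirroring MatrixElementKernelDevice::computeMatrixElements above:
//   gpuLaunchKernelSharedMem( blockSum, gpublocks, gputhreads, gputhreads * sizeof( float ), devIn, devOut );
// which the macro expands to
//   blockSum<<<gpublocks, gputhreads, gputhreads * sizeof( float )>>>( devIn, devOut );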
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
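The typedef blocks above give each event buffer a plain host flavour for CPU builds and, on GPU builds, pinned-host and device flavours whose constructors call gpuMallocHost and gpuMalloc (the #else branches carrying those typedefs are elided in this diff, so the Pinned*/Device* names below are assumed by analogy, not quoted). A hypothetical sketch of how such a pair travels through the copy helpers defined a little further below:

#include "MemoryBuffers.h"

// Hypothetical sketch, assuming a GPU build (MGONGPUCPP_GPUIMPL defined) and
// PinnedHostBufferMomenta/DeviceBufferMomenta typedefs from the elided #else branches.
void transferMomenta( const size_t nevt )
{
  mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // gpuMallocHost: pinned pages, fast DMA
  mg5amcGpu::DeviceBufferMomenta devMomenta( nevt );     // gpuMalloc: device global memory
  // ... fill hstMomenta.data() on the host ...
  mg5amcGpu::copyDeviceFromHost( devMomenta, hstMomenta ); // same argument order as memcpy( dst, src )
  // ... launch kernels that read devMomenta.data() ...
  mg5amcGpu::copyHostFromDevice( hstMomenta, devMomenta );
}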
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 6242b019fa..a376b0c455 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START 
LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
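The gpuMemcpy and gpuMemcpyToSymbol calls that replace checkCuda( cudaMemcpy... ) and checkCuda( cudaMemcpyToSymbol... ) in the MemoryBuffers.h and CPPProcess.cc hunks above come from the new GpuAbstraction.h header. The following is only a minimal sketch of how such wrappers can be spelled, not the actual header: the checkGpu error helper is an assumed name, and only the constants used in these hunks are shown.

#include <cstdio>
#include <cstdlib>
// Sketch only (assumed helper name): abort with a location message on any non-zero runtime return code.
#define checkGpu( code ) \
  do { if( ( code ) != 0 ) { std::fprintf( stderr, "GPU error at %s:%d\n", __FILE__, __LINE__ ); std::exit( 1 ); } } while( 0 )
#ifdef __CUDACC__
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif

Folding the error check into the wrapper is what lets the call sites above drop the explicit checkCuda( ... ) without losing error handling.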
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
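The GpuRuntime instantiated at the "00 GpuInit" step above takes over the job of the old CudaRuntime. Below is a minimal sketch of the RAII shape such a class can have, assuming gpuSetDevice and gpuDeviceReset aliases in GpuAbstraction.h; it is not the actual GpuRuntime.h.

// Sketch only: bind the GPU context to the lifetime of main(), so the device
// is selected exactly once and reset even if main() exits via an exception.
struct GpuRuntimeSketch
{
  GpuRuntimeSketch( bool debug = false )
    : m_debug( debug )
  {
    gpuSetDevice( 0 ); // cudaSetDevice( 0 ) on CUDA, hipSetDevice( 0 ) on HIP (assumed aliases)
  }
  ~GpuRuntimeSketch()
  {
    gpuDeviceReset(); // releases the device context; also flushes profiler output
  }
  const bool m_debug;
};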
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 90788b2c75..41f17b9fb0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ 
-203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
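The #error branch added in the random-number hunk above encodes an invariant rather than a feature: curand is a CUDA-only library, so a HIP compilation must already have MGONGPU_HAS_NO_CURAND defined by the build configuration, and CommonRandom becomes the default on AMD GPUs. A hypothetical sketch of where that invariant can be established upstream (an assumption, not the actual mgOnGpuConfig.h logic):

// Sketch only: force the no-curand path for HIP builds, making the
// "#error Internal error ..." branch in check_sa.cc unreachable.
#if defined __HIPCC__ && !defined MGONGPU_HAS_NO_CURAND
#define MGONGPU_HAS_NO_CURAND 1
#endif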
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
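The memory-buffer hunks above keep a consistent pattern: a pageable HostBuffer on CPU-only builds, a PinnedHostBuffer plus a device buffer on GPU builds, because a gpuMemcpy from page-locked memory can DMA directly and skip the intermediate staging copy that the MemoryBuffers.h comment (PR #45) warns about. A minimal sketch of what "pinned" amounts to, in CUDA spelling and with error handling omitted; these are not the actual MemoryBuffers.h classes.

#include <cuda_runtime.h>
#include <cstddef>
// Sketch only: host storage allocated with cudaMallocHost is page-locked,
// so cudaMemcpy/gpuMemcpy transfers avoid an extra host-side copy.
template<typename T>
struct PinnedHostBufferSketch
{
  explicit PinnedHostBufferSketch( std::size_t n ) { cudaMallocHost( (void**)&m_data, n * sizeof( T ) ); }
  ~PinnedHostBufferSketch() { cudaFreeHost( m_data ); }
  T* data() { return m_data; }
  T* m_data;
};

(The HIP analogue would use hipHostMalloc and hipHostFree.)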
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
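The RamboSamplingKernels.cc hunks above show the second half of the abstraction: kernel launches go through gpuLaunchKernel instead of the CUDA-only <<<blocks,threads>>> chevrons. A minimal sketch of how such a wrapper can be defined (an assumption about GpuAbstraction.h; the HIP branch uses the documented hipLaunchKernelGGL API with default shared-memory size and stream):

#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif

With either expansion, the call shown above, gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ), reaches the same kernel with the same grid geometry on both toolchains.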
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
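The RandomNumberKernels.h comment above is the key to the whole scheme: mgOnGpuConfig.h must be included first because it is the header that defines __global__ when MGONGPUCPP_GPUIMPL is not defined. A plausible sketch of that scheme follows; only the __global__ stub is guaranteed by the comment, the rest is an assumption.

// Sketch only: one umbrella macro for "a GPU compiler is active", plus no-op
// definitions of the GPU attribute keywords for plain C++ compilation.
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#else
#define __global__
#define __device__
#define __host__
#endif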
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
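The CrossSectionKernels build rules above deliberately strip fast math (-fno-fast-math for C++, -Xcompiler -fno-fast-math through nvcc; see #117 and #516). The reason is that fast math allows the compiler to reassociate floating-point reductions, and floating-point addition is not associative, so accumulated cross sections could otherwise change in their last digits. A tiny standalone C++ illustration of the non-associativity itself:

#include <cstdio>
// Floating-point addition is not associative: regrouping a sum changes the
// rounding, which is why reduction code is built without fast math.
int main()
{
  const double a = ( 0.1 + 0.2 ) + 0.3; // 0.6000000000000001
  const double b = 0.1 + ( 0.2 + 0.3 ); // 0.6
  printf( "left-assoc:  %.17g\n", a );
  printf( "right-assoc: %.17g\n", b );
  printf( "equal? %s\n", a == b ? "yes" : "no" ); // prints "no"
  return 0;
}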
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
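fbridge.cc above now brackets the Bridge lifetime with GpuRuntime::setUp() and GpuRuntime::tearDown() instead of the CUDA-only CudaRuntime calls, and runTest.cc below likewise calls checkGpu( gpuDeviceReset() ) rather than checkCuda( cudaDeviceReset() ). The real GpuRuntime.h is not part of this excerpt; the sketch below shows one plausible shape for these helpers, assuming each gpu* name is a thin alias for the corresponding CUDA or HIP runtime call (the aliases and the struct body are illustrative, not the shipped code).

#include <cstdio>
#include <cstdlib>
#ifdef __CUDACC__
#include <cuda_runtime.h>
typedef cudaError_t gpuError_t;
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuFree cudaFree
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
typedef hipError_t gpuError_t;
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuFree hipFree
#define gpuDeviceReset hipDeviceReset
#endif

// Abort with file/line context if a runtime call fails (sketch only)
#define checkGpu( code ) \
  { \
    const gpuError_t err = ( code ); \
    if( err != gpuSuccess ) \
    { \
      fprintf( stderr, "GPU error '%s' at %s:%d\n", gpuGetErrorString( err ), __FILE__, __LINE__ ); \
      exit( EXIT_FAILURE ); \
    } \
  }

// Hypothetical GpuRuntime helper (assumed interface, not the shipped header)
struct GpuRuntime
{
  static void setUp() { checkGpu( gpuFree( 0 ) ); }        // eagerly create the GPU context (no-op free)
  static void tearDown() { checkGpu( gpuDeviceReset() ); } // clean shutdown, e.g. for cuda-memcheck --leak-check full
};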
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_PLATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
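The mgOnGpuConfig.h hunks above introduce the central switch of this patch: MGONGPUCPP_GPUIMPL is defined when the compiler is nvcc (__CUDACC__) or hipcc (__HIPCC__), and undefined for plain C++ builds, so a single macro now selects the namespace (mg5amcGpu vs mg5amcCpu), the complex-number type, the curand default, the SIMD width, and the empty __global__/__host__/__device__ specifiers. A self-contained toy program using the same preprocessor logic makes the dispatch visible:

#include <iostream>

// Same three-way dispatch as in mgOnGpuConfig.h above
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#endif

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  // GPU build: namespace mg5amcGpu, no SIMD, native __global__/__device__
  std::cout << "GPU build (CUDA or HIP)" << std::endl;
#else
  // CPU build: namespace mg5amcCpu, SIMD vector types may be enabled
  std::cout << "CPU build (C++, possibly SIMD)" << std::endl;
#endif
  return 0;
}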
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
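mgOnGpuVectors.h above keeps a single family of "scalar-or-vector" typedefs: fptype_sv, fptype2_sv and bool_sv are plain scalars in CUDA/HIP builds (neppV=1) and gcc/clang vector-extension types in SIMD C++ builds, so one arithmetic expression compiles in both modes. A minimal sketch of the pattern, assuming doubles and a toy vector width of 4 (the MY_SIMD switch is illustrative, not the real MGONGPU_CPPSIMD machinery):

#include <iostream>

#define MY_SIMD 1 // set to 0 to mimic a GPU/scalar build
#if MY_SIMD
typedef double fptype_sv __attribute__( ( vector_size( 32 ) ) ); // 4 doubles per vector
constexpr int neppV = 4;
#else
typedef double fptype_sv;
constexpr int neppV = 1;
#endif

int main()
{
#if MY_SIMD
  fptype_sv p = { 0., 0., 0., 0. };
#else
  fptype_sv p = 0.;
#endif
  const fptype_sv e = p + 1.; // identical expression for scalar and vector builds
#if MY_SIMD
  for( int i = 0; i < neppV; i++ ) std::cout << e[i] << " "; // prints "1 1 1 1"
  std::cout << std::endl;
#else
  std::cout << e << " (neppV=" << neppV << ")" << std::endl; // prints "1 (neppV=1)"
#endif
  return 0;
}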
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.mad/src/rambo.h b/epochX/cudacpp/gq_ttq.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index f659f6bb8d..5a07808142 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054836273193359375  +DEBUG: model prefixing takes 0.005926370620727539  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. 
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.082 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -211,7 +211,7 @@ Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.179 s FFV1 FFV1 FFV1 @@ -227,7 +227,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.709s -user 0m0.586s -sys 0m0.064s -Code generation completed in 0 seconds +real 0m1.076s +user 0m0.601s +sys 0m0.061s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
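As an aside on the constructor hunk above: the Bridge must end up with m_nevt == m_gpublocks * m_gputhreads. A minimal standalone sketch of that normalization, with hypothetical names, assuming (as for s_gputhreadsmin here) that the minimum granularity is a power of two no larger than the default block size:

```cpp
#include <stdexcept>
#include <string>

// Illustrative only: normalize an event count into a (blocks, threads) grid.
void computeGrid( unsigned int nevt, unsigned int threadsMin, unsigned int& blocks, unsigned int& threads )
{
  if( nevt < threadsMin || nevt % threadsMin != 0 )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( threadsMin ) );
  threads = 256;                             // default block size, as in the Bridge ctor
  while( nevt % threads != 0 ) threads /= 2; // halve until the block size divides nevt exactly
  blocks = nevt / threads;                   // now nevt == blocks * threads by construction
}
```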
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
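The hunks in these files all repeat the same namespace-selection idiom that this patch standardizes on. In isolation, with a made-up Example type, the pattern is:

```cpp
// Compiled once for GPU builds and once for CPU builds: the same source lands
// in mg5amcGpu or mg5amcCpu, so the two flavours can be linked into a single
// executable without symbol clashes (see issues #318 and #725 cited above).
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  struct Example // hypothetical type, for illustration only
  {
    int ievt;
  };
}
```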
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
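To make the macro table in GpuAbstraction.h above concrete, here is an illustrative end-to-end use of the abstraction. This is not code from the repo: the scale kernel, the sizes and the example() wrapper are invented for the sketch, it assumes a GPU build (MGONGPUCPP_GPUIMPL defined), and checkGpu comes from GpuRuntime.h, shown next:

```cpp
#include "GpuAbstraction.h"
#include "GpuRuntime.h" // for checkGpu, used inside the gpu* macros

__global__ void scale( double* d, int n ) // toy kernel for the sketch
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) d[i] *= 2.;
}

void example()
{
  const int n = 1024;
  double* hst = nullptr;
  double* dev = nullptr;
  gpuMallocHost( (void**)&hst, n * sizeof( double ) ); // pinned host buffer
  gpuMalloc( (void**)&dev, n * sizeof( double ) );     // device buffer
  gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scale, n / 256, 256, dev, n );      // expands to scale<<<4, 256>>>( dev, n )
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hst, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );
  gpuFreeHost( hst );
}
```

The same translation unit then builds with nvcc (taking the __CUDACC__ branch) or hipcc (the __HIPCC__ branch). In application code, the RAII GpuRuntime wrapper defined in the next file would typically be instantiated at the top of main() before any of these calls.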
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+    }
+
+    // Tear down CUDA application (call cudaDeviceReset)
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+    static void tearDown( const bool debug = true )
+    {
+      if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
+      checkGpu( gpuDeviceReset() );
+    }
+  };
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h
index ef40624c88..a64c05c26a 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.

 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -22,7 +22,7 @@
 #include
 #include

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam

 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL

 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
   }
 }

-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL

 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc
index 74b5239ebf..81699dfea9 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "MatrixElementKernels.h"

 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"

@@ -14,7 +14,7 @@

 //============================================================================

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
@@ -150,7 +150,7 @@ namespace mg5amcCpu

 //============================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -209,13 +209,13 @@
     PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
     DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1.
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
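The hunks above mechanically replace CUDA's triple-chevron launches with the abstraction macros. As a reminder, for a hypothetical kernel k the two macros from GpuAbstraction.h expand as follows:

```cpp
// gpuLaunchKernel( k, blocks, threads, arg0, arg1 );
//   -> k<<<blocks, threads>>>( arg0, arg1 );
// gpuLaunchKernelSharedMem( k, blocks, threads, smem, arg0, arg1 );
//   -> k<<<blocks, threads, smem>>>( arg0, arg1 );
// HIP accepts the same triple-chevron syntax, so a single definition of each
// macro serves both the nvcc and hipcc toolchains.
```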
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
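The KernelAccessHelper hunk above encodes the one-event-per-GPU-thread convention; stripped to its essence (an illustrative sketch only, not the plugin's accessor):

```cpp
#ifdef MGONGPUCPP_GPUIMPL
// GPU build: each thread handles exactly one event, so the event index is
// derived from the thread coordinates.
__device__ inline int currentEvent()
{
  return blockDim.x * blockIdx.x + threadIdx.x;
}
#else
// CPU build: the caller iterates over events explicitly and passes the index.
inline int currentEvent( const int ievt )
{
  return ievt;
}
#endif
```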
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
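For the AOSOA momenta layout momenta[npagM][npar][np4][neppM] referenced in the neppM comment above, the flat index of one momentum component can be sketched as follows (an illustrative helper with hypothetical names, not the plugin's accessor):

```cpp
inline int aosoaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // page (AOSOA block) containing this event
  const int ieppM = ievt % neppM; // position of this event within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}
```

With neppM events per page, consecutive GPU threads (consecutive ieppM) read adjacent fptype's, which is what makes the global-memory access coalesced.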
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
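The buffer classes above tie gpuMallocHost/gpuMalloc and gpuFreeHost/gpuFree to object lifetime. Condensed to a standalone sketch (simplified, without the BufferBase hierarchy of the real code):

```cpp
#include <cstddef>

template<typename T>
class DeviceArray // illustrative RAII wrapper, not the plugin's DeviceBufferBase
{
public:
  explicit DeviceArray( std::size_t n )
    : m_n( n ), m_d( nullptr )
  {
    gpuMalloc( (void**)&m_d, bytes() ); // error-checked via checkGpu inside the macro
  }
  ~DeviceArray() { gpuFree( m_d ); }
  DeviceArray( const DeviceArray& ) = delete;
  DeviceArray& operator=( const DeviceArray& ) = delete;
  T* data() { return m_d; }
  std::size_t bytes() const { return m_n * sizeof( T ); }
private:
  std::size_t m_n;
  T* m_d;
};
```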
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 90e90b3aa9..c1543791ca 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -661,12 +662,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +688,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +814,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +840,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +860,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +874,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +907,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1117,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
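The workflow tags that follow ('CUD:'/'HIP:'/'CPP:', then a complex-type letter code) record which complex-number implementation was compiled in. A rough sketch of the selection that the new __HIPCC__/CXS branch implies; this is deliberately simplified, and the plugin's real cxtype machinery is richer:

```cpp
// Hedged sketch of the complex-type choice reported by the printouts below
// (assumption: simplified from the plugin's real complex-number headers).
#include <complex>
using fptype = double; // stand-in for the plugin's configurable floating-point type
#if defined __CUDACC__ && defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
using cxtype = thrust::complex<fptype>; // CUDA: thrust complex usable on device and host
#elif defined __HIPCC__
// HIP builds in this PR report MGONGPU_CUCXTYPE_CXSMPL, i.e. the plugin's own
// simple complex class (sketched here as a bare-bones stand-in).
struct cxsmpl { fptype r, i; };
using cxtype = cxsmpl;
#else
using cxtype = std::complex<fptype>; // C++ builds: std::complex
#endif
```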
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 76c9403933..a9294d1fea 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
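As in the gu_ttxu process earlier, the constant-memory initialisation in this file now goes through gpuMemcpyToSymbol, and device-to-host copies through gpuMemcpy. Below is a minimal sketch of such wrappers, with all names assumed (the PR's GpuAbstraction.h may differ); one real asymmetry worth noting is that CUDA takes the symbol directly while HIP expects it wrapped in HIP_SYMBOL:

```cpp
// Minimal sketch (assumption) of the memcpy wrappers used in the hunks above.
#include <cassert>
#define checkGpu( code ) assert( ( code ) == 0 ) // placeholder check (cudaSuccess/hipSuccess are 0)
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) \
  checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( symbol, src, bytes ) \
  checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) ) // CUDA takes the symbol itself
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) \
  checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( symbol, src, bytes ) \
  checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) ) // HIP needs HIP_SYMBOL
#endif
```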
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -661,12 +662,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +688,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +814,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +840,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +860,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +874,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +907,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1117,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
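Step '00 GpuInit' above replaces CudaRuntime with GpuRuntime in both check_sa.cc files. The new comment describes an RAII helper: the constructor selects the device, the destructor books the reset, so cleanup also runs on early exits. A hypothetical sketch of that shape, with gpuSetDevice/gpuDeviceReset as assumed aliases (the real GpuRuntime.h may differ):

```cpp
// Hypothetical sketch of the GpuRuntime RAII helper instantiated in step "00 GpuInit".
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuSetDevice cudaSetDevice
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuSetDevice hipSetDevice
#define gpuDeviceReset hipDeviceReset
#endif
struct GpuRuntime
{
  GpuRuntime( bool debug = false ) : m_debug( debug ) { gpuSetDevice( 0 ); } // pick GPU 0 up front
  ~GpuRuntime() { gpuDeviceReset(); } // tear down the device context at the end of main
  GpuRuntime( const GpuRuntime& ) = delete; // owns process-wide GPU state: non-copyable
  GpuRuntime& operator=( const GpuRuntime& ) = delete;
  const bool m_debug;
};
```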
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
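The RamboSamplingKernels.cc hunks earlier in this section replace the CUDA-only kernel<<<blocks,threads>>> launch syntax with gpuLaunchKernel(...), which is what allows the same line to compile under hipcc. A plausible sketch of that macro follows; this is an assumption, and the actual definition may add shared-memory, stream or error-check arguments:

```cpp
// Plausible sketch (assumption) of the gpuLaunchKernel wrapper used above.
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif
// Usage, as in the hunk above:
//   gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );
```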
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
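On the curand side, the RandomNumberKernels.h hunk seen earlier forward-declares struct curandGenerator_st rather than including curand.h, so the header parses even on hosts where CUDA is absent; only the implementation file needs the real library. A small sketch of that opaque-handle pattern (the class and member names here are illustrative, not the PR's exact ones):

```cpp
// Sketch of the opaque-handle pattern used by RandomNumberKernels.h (illustrative).
struct curandGenerator_st; // forward declaration: the type is only defined inside curand
class CurandKernelSketch
{
public:
  CurandKernelSketch();  // the .cc file, compiled where curand is available,
  ~CurandKernelSketch(); // would call curandCreateGenerator/curandDestroyGenerator
private:
  curandGenerator_st* m_rnGen = nullptr; // a pointer to an incomplete type is fine here
};
```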
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# NB: for nvcc builds, -x cu is now included via CCBUILDRULEFLAGS in the %_cu.o rule above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
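The library and executable rules above link two parallel sets of objects built from the same sources: CPPProcess.o etc. compiled as plain C++, and gCPPProcess.o etc. compiled through GPUCC (with -x cu under nvcc, as carried by CCBUILDRULEFLAGS earlier). This only works because the shared sources select their implementation from a macro rather than from the file extension. A minimal sketch of such a dual-build translation unit (foo.cc and fooKernel are hypothetical; the macro and namespace names are the ones used throughout this diff):

// foo.cc - built twice: as foo.o by $(CXX), and as foo_cu.o by $(GPUCC) with -x cu (nvcc) or directly (hipcc)
#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL for nvcc/hipcc builds, and defines __global__ as empty for C++
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build of this .cc file
#else
namespace mg5amcCpu // C++ build of the same .cc file
#endif
{
  // A trivial kernel: a real __global__ function in the GPU build, a plain host function in the C++ build
  __global__ void fooKernel() {}
}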
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
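In the fbridge.cc hunk above, the CUDA-only CudaRuntime.h header and its CudaRuntime::setUp/tearDown calls are replaced by GpuRuntime equivalents, so the same Fortran bridge serves both CUDA and HIP. GpuRuntime.h itself is not part of this excerpt; a minimal sketch of the wrapper it needs to provide, assuming it simply forwards to the native runtime (the checkGpu and gpuDeviceReset names appear elsewhere in this diff, the bodies below are illustrative):

// GpuRuntime.h (illustrative sketch, not the actual header from the repository)
#include <cassert>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuFree cudaFree
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuFree hipFree
#define gpuDeviceReset hipDeviceReset
#endif
#ifdef MGONGPUCPP_GPUIMPL
#define checkGpu( code ) assert( ( code ) == 0 ) // sketch only: the real helper should print the runtime error string
struct GpuRuntime
{
  static void setUp() { checkGpu( gpuFree( 0 ) ); } // freeing a null pointer is a common trick to force early context creation
  static void tearDown() { checkGpu( gpuDeviceReset() ); } // needed e.g. by cuda-memcheck --leak-check full
};
#endif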
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct build flags for each GPU compiler (nvcc also needs -x cu to compile .cc files as CUDA; hipcc does not) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
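The mgOnGpuConfig.h hunk that follows introduces the central abstraction of this patch: a single MGONGPUCPP_GPUIMPL macro, derived from the compiler-defined __CUDACC__ (nvcc) and __HIPCC__ (hipcc) macros, which the rest of the code base then tests instead of __CUDACC__. A self-contained illustration of the three-way dispatch (the macro logic is copied from the hunk below; the demo program and its printouts are illustrative):

// gpuimpl_demo.cc - compile with g++, with nvcc -x cu, or with hipcc to exercise the three branches
#include <iostream>
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#else
#undef MGONGPUCPP_GPUIMPL
#endif
int main()
{
#if defined __CUDACC__
  std::cout << "GPU implementation: CUDA" << std::endl;
#elif defined __HIPCC__
  std::cout << "GPU implementation: HIP" << std::endl;
#else
  std::cout << "CPU implementation (MGONGPUCPP_GPUIMPL undefined)" << std::endl;
#endif
  return 0;
}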
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
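The complex-number reorganisation above, together with the mgOnGpuCxtypes.h hunks that follow, encodes a three-way choice of complex type: CUDA may use thrust::complex, cuComplex or the home-grown cxsmpl (thrust is the default), HIP is restricted to cxsmpl, and C++ chooses between std::complex and cxsmpl. A condensed sketch of the resulting cxtype selection, assuming mgOnGpuCxtypes.h provides the cxsmpl template (double precision only; the cuComplex branch is omitted for brevity):

// Condensed sketch of the cxtype ladder after this patch (double precision case)
#if defined __CUDACC__ && defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // CUDA default
#elif defined __HIPCC__
typedef mgOnGpu::cxsmpl<double> cxtype; // the only option in HIP
#elif !defined __CUDACC__ && defined MGONGPU_CPPCXTYPE_STDCOMPLEX
#include <complex>
typedef std::complex<double> cxtype; // C++ alternative
#else
typedef mgOnGpu::cxsmpl<double> cxtype; // C++ default (and the CUDA cxsmpl fallback)
#endif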
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.sa/src/rambo.h b/epochX/cudacpp/gq_ttq.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 800492306f..9bac4b3aae 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -153,7 +153,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.062 s +ALOHA: aloha creates 1 routines in 0.060 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -165,7 +165,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m0.471s -user 0m0.367s -sys 0m0.052s -Code generation completed in 0 seconds +real 0m0.414s +user 0m0.350s +sys 0m0.059s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
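The Bridge.h changes that follow guard the device-side buffer members behind MGONGPUCPP_GPUIMPL while the host buffers remain available in C++-only builds. Those members rely on the RAII buffer classes of MemoryBuffers.h, which are not shown in this excerpt; a much-simplified sketch of the idea behind a device buffer (the real classes are templated differently; only the data() and bytes() accessors are names visible in this diff):

// Sketch of an RAII device buffer in the spirit of the MemoryBuffers.h classes (CUDA branch only)
#ifdef __CUDACC__
#include <cuda_runtime.h>
#include <cstddef>
template<typename T>
class DeviceBufferSketch
{
public:
  explicit DeviceBufferSketch( std::size_t size ) : m_size( size ), m_data( nullptr )
  {
    cudaMalloc( &m_data, bytes() ); // allocation is released automatically in the destructor
  }
  ~DeviceBufferSketch() { cudaFree( m_data ); }
  T* data() { return m_data; }
  std::size_t bytes() const { return m_size * sizeof( T ); }
private:
  std::size_t m_size;
  T* m_data;
};
#endif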
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
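[Editor's note] The Bridge constructor shown above derives the GPU grid from the Fortran-side event count: 256 threads per block by default, m_gpublocks = m_nevt / m_gputhreads, a hard requirement that nevt be a multiple of s_gputhreadsmin, and a corrective loop when the product does not tile nevt exactly. A standalone sketch of that sizing logic; the halving step and the minimum of 16 are assumptions, since the loop body and the constant are not visible in this hunk:

#include <stdexcept>
#include <string>

// Hypothetical standalone version of the grid sizing in the Bridge ctor.
void sizeGpuGrid( unsigned int nevt, int& gpublocks, int& gputhreads )
{
  constexpr unsigned int gputhreadsmin = 16; // assumed value of s_gputhreadsmin
  if( ( nevt < gputhreadsmin ) || ( nevt % gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( gputhreadsmin ) );
  gputhreads = 256;              // default number of gpu threads
  gpublocks = nevt / gputhreads; // integer division: may undershoot nevt
  while( nevt != (unsigned int)( gpublocks * gputhreads ) )
  {
    gputhreads /= 2; // assumed strategy: halve the threads until the grid tiles nevt exactly
    if( gputhreads < (int)gputhreadsmin )
      throw std::logic_error( "gputhreads fell below the minimum" ); // unreachable if nevt is a valid multiple
    gpublocks = nevt / gputhreads;
  }
}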
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
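[Editor's note] Stepping back to the Bridge::gpu_sequence hunks just above: the Fortran momenta are copied straight into the AOSOA device buffer only when the layouts coincide (neppM == 1 and identical floating-point types); otherwise they are staged and a transpose kernel reshuffles them, one element per thread. An illustrative condensation of that dispatch, reusing the gpu* macros that the new GpuAbstraction.h (further below in this diff) defines; the function and argument names here are invented:

#include <type_traits>

// Hypothetical condensation of the layout dispatch in gpu_sequence.
// dev_transposeMomentaF2C is the transpose kernel declared in Bridge.h.
template<typename FORTRANFPTYPE, typename fptype, int neppM>
void copyMomentaToDevice( const FORTRANFPTYPE* hstF, fptype* devC, FORTRANFPTYPE* devF,
                          int npar, int np4, int gpublocks, int gputhreads, unsigned int nevt )
{
  if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
  {
    // Fortran AOS and C++ AOSOA layouts coincide: a single copy suffices
    gpuMemcpy( devC, hstF, nevt * npar * np4 * sizeof( fptype ), gpuMemcpyHostToDevice );
  }
  else
  {
    // Stage the Fortran-ordered momenta, then transpose on the device,
    // one element (not one event) per GPU thread
    gpuMemcpy( devF, hstF, nevt * npar * np4 * sizeof( FORTRANFPTYPE ), gpuMemcpyHostToDevice );
    const int thrPerEvt = npar * np4;
    gpuLaunchKernel( dev_transposeMomentaF2C, gpublocks * thrPerEvt, gputhreads,
                     devF, devC, nevt );
  }
}

The element-per-thread choice is deliberate: the diff's own comment notes that a one-event-per-thread variant was tried and found slower.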
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
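[Editor's note] GpuAbstraction.h, added in full above, is a thin macro layer: each gpu* name expands to the matching cuda* call under __CUDACC__ and the matching hip* call under __HIPCC__, and the launch macros hide the <<<blocks, threads>>> chevron syntax behind a portable spelling. A hedged usage sketch: the scale kernel and sizes are invented, and a GPU build is assumed so that checkGpu (from GpuRuntime.h, the next new file) is defined:

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // for checkGpu/assertGpu

// Toy kernel, invented for illustration only
__global__ void scale( double* data, double factor, unsigned int n )
{
  const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) data[i] *= factor;
}

void scaleOnDevice( double* hst, unsigned int n )
{
  double* dev = nullptr;
  gpuMalloc( (void**)&dev, n * sizeof( double ) ); // cudaMalloc or hipMalloc (checkGpu already wrapped in)
  gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scale, ( n + 255 ) / 256, 256, dev, 2.0, n ); // expands to scale<<<blocks,threads>>>(...)
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hst, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );
}

Note that gpuMalloc, gpuMemcpy and gpuFree already include the checkGpu wrapper in their definitions above, whereas gpuPeekAtLastError and gpuDeviceSynchronize are bare renames and must be wrapped explicitly, exactly as the MatrixElementKernels hunks below do.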
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1.
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
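[Editor's note] The computeGoodHelicities rewrite above keeps its three steps (0d1 launch the mask kernels, 0d2 copy the mask back to a pinned host buffer, 0d3 push the surviving helicity list into device constant memory) and only restates the launches and error checks through the macros. A hypothetical condensation of the final compaction step, modelled on the sigmaKin_setGoodHel code later in this diff; the buffer types are simplified to raw arrays and the bound of 32 on ncomb is illustrative:

// Hypothetical condensation of step 0d3: compact the boolean mask into an
// index list and push it to the __constant__ symbols declared in CPPProcess.cc.
__device__ __constant__ int cNGoodHel;
__device__ __constant__ int cGoodHel[32];

void copyGoodHelicities( const bool* hstIsGoodHel, int ncomb )
{
  int goodHel[32] = { 0 };
  int nGoodHel = 0;
  for( int ihel = 0; ihel < ncomb; ihel++ )
    if( hstIsGoodHel[ihel] ) goodHel[nGoodHel++] = ihel; // keep only contributing helicities
  gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
  gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
}

Later event loops then iterate over cNGoodHel entries of cGoodHel instead of all ncomb combinations, which is the entire point of the mask.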
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h index d65c9d6e04..85c3c9ed1c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
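[Editor's note] The KernelAccessHelper hunk above is the one place where the event index itself is backend-dependent: on the GPU it is derived from the thread coordinates, while the C++ path receives an explicit event number. A toy analogue of the device branch; a flat per-event record is assumed here for illustration, whereas the real accessors handle the AOSOA layout:

// Toy analogue of the device branch in KernelAccessHelper::kernelAccessRecord.
__device__ inline double* eventRecord( double* buffer, int wordsPerEvent )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread == one event
  return buffer + ievt * wordsPerEvent;
}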
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h index 8109470148..78004e66cc 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_heft.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
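[Editor's note] MemoryBuffers.h keeps a single host/pinned/device buffer hierarchy and stamps out the per-quantity typedefs above behind the same guard; only the allocation calls changed, from checkCuda(cudaMalloc(...)) to the gpuMalloc/gpuFree macros. A stripped-down sketch of the device-side RAII pattern, without the BufferBase hierarchy or the NumberOfEvents mix-in:

#include <cstddef>

// Hypothetical minimal analogue of DeviceBufferBase using the gpu* macros:
// allocation in the constructor, release in the destructor, no copies.
template<typename T>
class SimpleDeviceBuffer
{
public:
  explicit SimpleDeviceBuffer( size_t size ) : m_size( size )
  {
    gpuMalloc( (void**)&m_data, bytes() ); // cudaMalloc or hipMalloc, checkGpu included
  }
  ~SimpleDeviceBuffer() { gpuFree( m_data ); } // freed on scope exit (RAII)
  SimpleDeviceBuffer( const SimpleDeviceBuffer& ) = delete;
  SimpleDeviceBuffer& operator=( const SimpleDeviceBuffer& ) = delete;
  T* data() { return m_data; }
  size_t bytes() const { return m_size * sizeof( T ); }
private:
  size_t m_size;
  T* m_data = nullptr;
};

Because the destructor goes through the same macro layer, a scope-local buffer behaves identically in CUDA and HIP builds, which is what lets the typedef blocks above stay backend-agnostic.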
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index 526bd7d296..3b6085c784 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu //__device__ const fptype* cIPD = nullptr; // unused as nparam=0 __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0 __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -268,7 +269,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 2 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -325,7 +326,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -384,7 +385,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -419,8 +420,8 @@ namespace mg5amcCpu { -1, 1, 0 }, { 1, -1, 0 }, { 1, 1, 0 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -459,9 +460,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory //const fptype tIPD[0] = { ... }; // nparam=0 //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - //checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ) ); // nparam=0 - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + //gpuMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else //memcpy( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -495,7 +496,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -560,12 +561,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -586,7 +587,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -712,9 +713,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -738,7 +739,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -758,7 +759,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -772,9 +773,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -802,7 +806,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1012,7 +1016,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h index dbc5aa0e4e..e1caef360b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc 
b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <cxx>" from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>" + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP?
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
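The pair of pattern rules a few hunks above ($(BUILDDIR)/%.o built from *.cu sources, $(BUILDDIR)/%_cu.o built from plain *.cc sources) is the heart of the single-source scheme: CCBUILDRULEFLAGS carries "-x cu" under nvcc so that ordinary C++ files are recompiled as CUDA, while under hipcc it is just "-fPIC -c" because hipcc compiles C++ sources as HIP directly. The effect, sketched below on a hypothetical minimal source example.cc (not a file in this diff), is one translation unit yielding two objects whose symbols live in disjoint namespaces:

    // example.cc (hypothetical): compiled twice by the pattern rules above
    //   $(CXX) ... -fPIC -c example.cc -o example.o -> defines mg5amcCpu::whoami
    //   $(GPUCC) ... [-x cu] -fPIC -c example.cc -o example_cu.o -> defines mg5amcGpu::whoami
    #include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL in GPU builds (see the note above)
    #include <cstdio>
    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu // device flavour of every symbol (goes into lib mg5amc_<process>_cuda)
    #else
    namespace mg5amcCpu // host flavour of every symbol (goes into lib mg5amc_<process>_cpp)
    #endif
    {
      void whoami()
      {
    #ifdef MGONGPUCPP_GPUIMPL
        printf( "example_cu.o: built by GPUCC\n" );
    #else
        printf( "example.o: built by CXX\n" );
    #endif
      }
    }

Because the two namespaces are disjoint, both flavours of every object can be linked into the same binary, which is exactly what the $(testmain) rule below does when GPUCC is set.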
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" */ Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc index a1c3cdc238..688cb8167b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h index eae9ff5242..dbff117235 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc index e5442756b1..d3d6058b46 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
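The FPEhandler above only reports which event tripped a floating point exception; the trapping itself relies on the platform turning FP exceptions into SIGFPE. A self-contained sketch of the same trap-and-report pattern, assuming glibc's feenableexcept extension (available on the Linux CI runners used here):

// Build with g++ on Linux; feenableexcept is a glibc extension, not standard C++.
#include <fenv.h>  // feenableexcept, FE_* flags
#include <csignal> // std::signal, SIGFPE
#include <cstdio>
#include <cstdlib>
static void fpeHandler( int /*sig*/ )
{
  std::fprintf( stderr, "Floating Point Exception: aborting\n" );
  std::exit( 1 ); // do not return: the faulting instruction would be re-executed
}
int main()
{
  std::signal( SIGFPE, fpeHandler );
  feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW ); // raise SIGFPE on these
  volatile double zero = 0.;
  volatile double bad = 1. / zero; // traps here
  (void)bad;
  std::printf( "never reached\n" );
  return 0;
}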
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index 790485fee0..c2be5bba97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -28,7 +28,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -94,7 +94,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -230,7 +230,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -247,7 +247,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -298,7 +298,7 @@ namespace mg5amcCpu // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk index 0bd815c9b3..998d3c84fa 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the GPU compiler, CUDA or HIP (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Set compiler-specific GPUFLAGS, depending on whether GPUCC is nvcc (CUDA) or hipcc (HIP) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_heft.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
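The mgOnGpuConfig.h hunk that follows is the heart of the PR: MGONGPUCPP_GPUIMPL becomes the single compile-time answer to "is this a CUDA, HIP or plain C++ build". A minimal sketch of the idiom it enables, compilable as-is with g++, nvcc or hipcc (the namespace names are the real ones, the rest is illustration):

#include <cstdio>
// Reproduce the switch introduced below in mgOnGpuConfig.h
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#endif
// Single-source file: one namespace for GPU builds, another for CPU builds
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  void whereAmI()
  {
#ifdef MGONGPUCPP_GPUIMPL
    std::printf( "GPU build (CUDA or HIP)\n" );
#else
    std::printf( "CPU build\n" );
#endif
  }
}
int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  mg5amcGpu::whereAmI();
#else
  mg5amcCpu::whereAmI();
#endif
  return 0;
}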
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, assume that curand is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
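The "#define __global__ (empty)" block above is what keeps these single-source headers valid in plain C++ builds: when MGONGPUCPP_GPUIMPL is undefined, the CUDA/HIP declaration specifiers simply vanish. A minimal sketch (the guard is spelled with the raw compiler macros so the snippet stands alone):

#include <cstdio>
#if !defined __CUDACC__ && !defined __HIPCC__ // i.e. #ifndef MGONGPUCPP_GPUIMPL in this PR
#define __global__
#define __host__
#define __device__
#endif
__host__ __device__ inline double square( double x ) { return x * x; } // host+device on GPU, plain inline on CPU
int main()
{
  std::printf( "square( 3. ) = %f\n", square( 3. ) );
  return 0;
}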
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index ff161c336f..adfd21027c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005424976348876953  +DEBUG: model prefixing takes 0.0053827762603759766  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.030 s +5 processes with 7 diagrams generated in 0.029 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.139 s +13 processes with 76 diagrams generated in 0.136 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.876 s +65 processes with 1119 diagrams generated in 1.869 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -801,15 +801,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.312 s -Wrote files for 810 helas calls in 3.308 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.297 s +Wrote files for 810 helas calls in 3.533 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.342 s +ALOHA: aloha creates 5 routines in 0.333 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -817,7 +817,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.321 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -1028,9 +1028,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m9.073s -user 0m8.514s -sys 0m0.464s +real 0m9.184s +user 0m8.370s +sys 0m0.508s Code generation completed in 9 seconds ************************************************************ * * @@ -1057,7 +1057,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -1087,7 +1087,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT +++ b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != 
m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
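Bridge::gpu_sequence above copies the Fortran momenta to the device and, unless the two layouts already coincide (neppM == 1 with matching precision), launches dev_transposeMomentaF2C to reorder them. A host-side sketch of that reordering; the AOSOA page size and both array layouts are written out as assumptions for illustration:

#include <cassert>
#include <vector>
constexpr int npar = 4, np4 = 4, neppM = 4; // assumed AOSOA page size
// in : Fortran-ordered momenta[ievt][ipar][ip4] (event-major), as passed by MadEvent
// out: AOSOA momenta[ipag][ipar][ip4][iepp] with ievt = ipag * neppM + iepp
void transposeMomentaF2C( const double* in, double* out, const int nevt )
{
  assert( nevt % neppM == 0 ); // nevt must be a multiple of the page size
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int ipag = ievt / neppM;
        const int iepp = ievt % neppM;
        out[( ( ipag * npar + ipar ) * np4 + ip4 ) * neppM + iepp] = in[( ievt * npar + ipar ) * np4 + ip4];
      }
}
int main()
{
  const int nevt = 8;
  std::vector<double> in( nevt * npar * np4, 1. ), out( in.size() );
  transposeMomentaF2C( in.data(), out.data(), nevt );
  return 0;
}

The device kernel in the patch performs the same index arithmetic, but with one element per thread rather than one event per thread (see the thrPerEvt comment above).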
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include <cassert> + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
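For orientation, here is a minimal sketch (not part of the patch) of how the gpuX macros above are meant to be used. It assumes that checkGpu/assertGpu come from the new GpuRuntime.h shown next, that MGONGPUCPP_GPUIMPL is defined by the build as elsewhere in this patch, and that the file is compiled with nvcc or hipcc; scaleKernel is a hypothetical kernel, not one from the plugin.

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // provides checkGpu/assertGpu

__global__ void scaleKernel( double* data, double factor )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x; // one element per GPU thread
  data[i] *= factor;
}

int main()
{
  const int gpublocks = 2, gputhreads = 32;
  double* devData = nullptr;
  gpuMalloc( &devData, gpublocks * gputhreads * sizeof( double ) );   // cudaMalloc or hipMalloc, wrapped in checkGpu
  gpuLaunchKernel( scaleKernel, gpublocks, gputhreads, devData, 2. ); // expands to scaleKernel<<<gpublocks, gputhreads>>>( devData, 2. )
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuFree( devData );
  return 0;
}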
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include <iostream> + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
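The kernel-launch rewrite above follows one mechanical pattern throughout; as an illustrative before/after sketch (argument lists abbreviated, not a quotation from the patch):

// Before, CUDA-only triple-chevron launch syntax:
//   sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), /* ... */ );
//   checkCuda( cudaPeekAtLastError() );
// After, portable across CUDA and HIP via the GpuAbstraction.h macros:
//   gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), /* ... */ );
//   checkGpu( gpuPeekAtLastError() );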
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
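All the MemoryAccess*.h hunks above apply the same namespace idiom; a self-contained sketch of the pattern, and of why it keeps the CPU and GPU objects linkable into one executable (ExampleRecord is hypothetical):

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // The same source is compiled twice: the GPU build puts its types in
  // mg5amcGpu and the CPU build puts them in mg5amcCpu, so identically
  // named types that are defined in different ways for the two builds
  // never produce multiply defined or mismatched symbols (see #318 and #725).
  struct ExampleRecord
  {
    int ievt; // event index
  };
}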
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
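For background to the neppM remark above: the momenta buffer uses an AOSOA layout so that coalesced GPU reads and CPU SIMD loads fall out of the same memory arrangement. A schematic of the indexing, in my notation following the conventions of MemoryAccessMomenta.h (a sketch, not a quotation):

// AOSOA layout: momenta[npagM][npar][np4][neppM], with ievt = ipagM * neppM + ieppM.
// On a GPU, consecutive threads process consecutive ieppM values and therefore
// read consecutive fptypes from global memory (coalesced access); on a CPU,
// one page of neppM events maps onto one SIMD vector (fptype_sv).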
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include <sstream> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template<typename T> class PinnedHostBufferBase : public BufferBase<T> { @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase<T>( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template<typename T> class DeviceBufferBase : public BufferBase<T> { @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase<T>( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of
events template<typename T, size_t sizePerEvent, bool ismisaligned> class HostBuffer : public HostBufferBase<T, ismisaligned>, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template<typename T, size_t sizePerEvent> class PinnedHostBuffer : public PinnedHostBufferBase<T>, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer<fptype, sizePerEventRndNumMomenta, HostBufferALIGNED> HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer<fptype, sizePerEventOneFp, HostBufferALIGNED> HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventGs, HostBufferALIGNED> HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventNumerators, HostBufferALIGNED> HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventDenominators, HostBufferALIGNED> HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer<fptype, sizePerEventCouplings, HostBufferALIGNED> HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferALIGNED> HostBufferMomenta; //typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferMISALIGNED> HostBufferMomenta; // TEST MISALIGNMENT!
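A short sketch of how the buffer typedefs above pair up in practice, mirroring the allocation pattern used by check_sa.cc later in this patch (the function name and nevt value are hypothetical; copyDeviceFromHost is the helper defined just below):

#include "MemoryBuffers.h"

void exampleAllocation()
{
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
  const size_t nevt = 16384;
  PinnedHostBufferMomenta hstMomenta( nevt );   // pinned host memory via gpuMallocHost
  DeviceBufferMomenta devMomenta( nevt );       // device memory via gpuMalloc
  copyDeviceFromHost( devMomenta, hstMomenta ); // gpuMemcpy with gpuMemcpyHostToDevice
#else
  using namespace mg5amcCpu;
  const size_t nevt = 16384;
  HostBufferMomenta hstMomenta( nevt ); // plain aligned C++ host buffer
#endif
}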
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer<fptype, sizePerEventWeights, HostBufferALIGNED> HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer<fptype, sizePerEventMatrixElements, HostBufferALIGNED> HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase<bool> BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase<bool, HostBufferALIGNED> HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer<fptype, sizePerEventWavefunctions, HostBufferALIGNED> HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer<fptype, sizePerEventRndNumHelicity, HostBufferALIGNED> HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer<fptype, sizePerEventRndNumColor, HostBufferALIGNED> HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer<int, sizePerEventSelectedHelicity, HostBufferALIGNED> HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer<int, sizePerEventSelectedColor, HostBufferALIGNED> HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 7f14b5e299..40d8bdea5f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +822,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 448175be9d..f8a20b77fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
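To summarize the selection logic encoded by the check_sa.cc hunks above (a restatement of the patch, not new behaviour):

// __CUDACC__ (nvcc):  default rndgen = CurandDevice, i.e. curand running on the GPU.
// __HIPCC__ (hipcc):  curand is CUDA-only, so HIP builds must define
//                     MGONGPU_HAS_NO_CURAND and fall back to CommonRandom;
//                     the #error above enforces this at compile time.
// neither (C++ CPU):  default rndgen = CurandHost, i.e. curand on the CPU.
// Independently, MGONGPUCPP_GPUIMPL selects RamboDevice sampling by default on
// any GPU backend, while the CPU path keeps RamboHost.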
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 20496eaa70..5f57cf55f3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ WEIGHTED<=2 // Process: s s~ > t t~ WEIGHTED<=2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 
+202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -279,7 +280,7 @@ namespace mg5amcCpu { 9, 3 }, { 3, 9 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -336,7 +337,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -395,7 +396,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -442,8 +443,8 @@ namespace mg5amcCpu { -1, 1, -1, -1 }, { -1, 1, 1, 1 }, { -1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -483,9 +484,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -521,7 +522,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
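The comment kept in this hunk is worth unpacking: per the source's own note, nvcc defines __NVCC__ even when it is merely driving a plain .cc file, whereas __CUDACC__ (and therefore the new MGONGPUCPP_GPUIMPL) is only set when the translation unit is actually compiled as CUDA code ('nvcc -x cu'). A sketch of a probe that separates the two conditions (illustrative helper, not in the patch):

#include <sstream>
#include <string>

inline std::string compilerProbe()
{
  std::stringstream out;
#ifdef __NVCC__
  out << "driven by nvcc"; // true even for plain .cc files fed to nvcc
#ifdef __CUDACC__
  out << " (CUDA compilation mode)"; // true only for CUDA translation units
#endif
#else
  out << "not driven by nvcc";
#endif
  return out.str();
}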
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -586,12 +587,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -612,7 +613,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -738,9 +739,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -764,7 +765,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -784,7 +785,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -798,9 +799,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -828,7 +832,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1038,7 +1042,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index e166fa1652..6498b91441 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
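The rndgen hunk above encodes a build invariant rather than a runtime choice: curand ships only with the CUDA toolkit, so a HIP build must have been configured with MGONGPU_HAS_NO_CURAND (making CommonRandom the default), and reaching the __HIPCC__ branch at all means the build system is misconfigured. The new #error turns that into a compile-time failure instead of a silently unusable default. Reduced to its bare shape (a sketch of the pattern, not additional patch content):

// Fail the build loudly if a HIP compilation ever reaches a curand default:
// curand is CUDA-only, so HIP builds must predefine MGONGPU_HAS_NO_CURAND.
#if defined __HIPCC__ && !defined MGONGPU_HAS_NO_CURAND
#error "HIP builds require MGONGPU_HAS_NO_CURAND (no curand on AMD GPUs)"
#endif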
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
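Throughout the buffer hunks above, the host-side allocations (hstRndmom, hstWeights, hstMomenta, and the rest) switch between two families: plain HostBuffer* when MGONGPUCPP_GPUIMPL is undefined and PinnedHostBuffer* when it is. Pinned (page-locked) host memory is what makes host-to-device and device-to-host copies fast and asynchronous on both CUDA and HIP. A self-contained sketch of the idea, with hypothetical names (the repository's real buffer classes differ in detail, and the runtime headers are assumed to be pulled in by nvcc/hipcc):

#include <cstddef>

template <typename T>
class PinnedHostBufferSketch
{
public:
  explicit PinnedHostBufferSketch( std::size_t n ) : m_size( n ), m_data( nullptr )
  {
#ifdef __CUDACC__
    cudaMallocHost( (void**)&m_data, n * sizeof( T ) ); // page-locked: enables fast async copies
#elif defined __HIPCC__
    hipHostMalloc( (void**)&m_data, n * sizeof( T ), hipHostMallocDefault ); // page-locked
#else
    m_data = new T[n]; // pageable fallback for CPU-only builds
#endif
  }
  ~PinnedHostBufferSketch()
  {
#ifdef __CUDACC__
    cudaFreeHost( m_data );
#elif defined __HIPCC__
    hipHostFree( m_data );
#else
    delete[] m_data;
#endif
  }
  PinnedHostBufferSketch( const PinnedHostBufferSketch& ) = delete; // no accidental double-free
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  std::size_t m_size;
  T* m_data;
};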
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index afeebde3c6..0e4d5d1157 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
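The cHel/cIPD hunks in this file show the core of the port: every checkCuda( cudaMemcpyToSymbol( ... ) ) call site collapses to a single gpuMemcpyToSymbol( ... ) whose expansion is owned by the new GpuAbstraction.h, included per subprocess via symlink as the file-mode hunks below show. One plausible shape for that mapping, with the error check folded in so call sites stay one-liners (a sketch using '...Sketch' names, not a copy of the real header):

#include <cassert>
#include <cstdio>

#ifdef __CUDACC__
#define gpuMemcpyToSymbolSketch( dst, src, bytes ) \
  do { \
    cudaError_t err = cudaMemcpyToSymbol( dst, src, bytes ); \
    if( err != cudaSuccess ) { printf( "GPU error: %s\n", cudaGetErrorString( err ) ); assert( false ); } \
  } while( 0 )
#elif defined __HIPCC__
#define gpuMemcpyToSymbolSketch( dst, src, bytes ) \
  do { \
    hipError_t err = hipMemcpyToSymbol( HIP_SYMBOL( dst ), src, bytes ); \
    if( err != hipSuccess ) { printf( "GPU error: %s\n", hipGetErrorString( err ) ); assert( false ); } \
  } while( 0 )
#endif

// A call site then reads identically on both backends, e.g.:
//   gpuMemcpyToSymbolSketch( cHel, tHel, ncomb * npar * sizeof( short ) );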
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
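The STEP 0 hunk just below replaces CudaRuntime with GpuRuntime while keeping the same RAII idea: the constructor binds the process to device 0 and the destructor books a device reset, so initialisation cost lands in the "00 GpuInit" timer and shutdown is clean even on early exits. A minimal sketch of that shape; the patch only documents the CUDA behaviour, so the HIP branch here is an assumption, and the real GpuRuntime.h adds error checking:

#include <cstdio>

class GpuRuntimeSketch
{
public:
  explicit GpuRuntimeSketch( bool debug = false )
  {
    if( debug ) printf( "GpuRuntimeSketch: selecting device 0\n" );
#ifdef __CUDACC__
    cudaSetDevice( 0 ); // bind this process to the first visible NVidia GPU
#elif defined __HIPCC__
    hipSetDevice( 0 ); // assumed HIP equivalent for AMD GPUs
#endif
  }
  ~GpuRuntimeSketch()
  {
#ifdef __CUDACC__
    cudaDeviceReset(); // clean shutdown: flushes profiler and leak-checker state
#elif defined __HIPCC__
    hipDeviceReset();
#endif
  }
};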
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index b7e3475679..e098c03e3a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ 
-203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
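An aside on the GpuRuntime.h that this file now includes: the patch renames the old CudaRuntime RAII helper so that a single name covers both GPU backends. A minimal sketch of the idea follows, with assumed member names and assumed gpu* aliases (the real implementation is the GpuRuntime.h symlinked elsewhere in this patch).

// Sketch only: one stack object at the top of main() brackets the whole run.
#include <cstdio>
struct GpuRuntimeSketch
{
  GpuRuntimeSketch( const bool debug = false ) : m_debug( debug )
  {
    if( m_debug ) printf( "GpuRuntimeSketch: selecting device 0\n" );
    gpuSetDevice( 0 ); // assumed alias for cudaSetDevice / hipSetDevice
  }
  ~GpuRuntimeSketch() // runs on any exit path from main()
  {
    if( m_debug ) printf( "GpuRuntimeSketch: resetting the device\n" );
    gpuDeviceReset(); // assumed alias for cudaDeviceReset / hipDeviceReset
  }
  const bool m_debug;
};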
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
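Note that the complex-type printout below still tests __CUDACC__ and __HIPCC__ directly rather than the new MGONGPUCPP_GPUIMPL: the umbrella macro only answers "GPU build or CPU build?", while this report must also tell CUDA apart from HIP. A plausible sketch of how the umbrella macro is derived (the exact spelling and location are assumptions; the real definition belongs to the mgOnGpu configuration headers, not to this patch):

/* Sketch only (assumed): one macro that means "compiling for a GPU". */
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 /* GPU build: nvcc (CUDA) or hipcc (HIP) */
#endif /* otherwise undefined: plain C++ build, possibly with SIMD/OpenMP */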
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 0f999663da..7308f8a2c7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
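One more aside, on the gpu* calls used throughout these hunks (gpuMemcpyToSymbol above, plus the device setup and reset behind GpuRuntime): they presuppose a thin portability layer. Below is a minimal sketch of what GpuAbstraction.h plausibly provides, restricted to the symbols this patch uses; the checkGpu error-checking wrapper is an assumed name, and some HIP releases may additionally require HIP_SYMBOL() around the symbol argument.

/* Sketch only: spell the native runtime API once, under common gpu* names. */
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuSetDevice( dev ) checkGpu( cudaSetDevice( dev ) )
#define gpuDeviceReset() checkGpu( cudaDeviceReset() )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuSetDevice( dev ) checkGpu( hipSetDevice( dev ) )
#define gpuDeviceReset() checkGpu( hipDeviceReset() )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( sym, src, bytes ) )
#endif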
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 87830582d7..b37df5d33f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g WEIGHTED<=3 @1 // Process: s s~ > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { -1, 1, 1, 1, 1 }, { -1, 1, 1, -1, -1 }, { -1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
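A note on the central macro in these hunks: MGONGPUCPP_GPUIMPL is the new vendor-neutral switch, while __CUDACC__ and __HIPCC__ remain for genuinely CUDA-only or HIP-only concerns (nvcc diagnostics and curand on one side, HIP quirks on the other). Its definition is not part of this diff; a plausible minimal sketch, stated here as an assumption, is:

// Assumed sketch (the real definition lives in the repository's GPU
// configuration headers, which this diff does not show): set one
// vendor-neutral switch whenever either GPU compiler is active.
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif

// Call sites then follow the pattern used throughout the hunks above:
#ifdef MGONGPUCPP_GPUIMPL
// common GPU path (CUDA or HIP): mg5amcGpu namespace, __global__ kernels
#else
// CPU path: mg5amcCpu namespace, SIMD/OpenMP matrix elements
#endif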
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index f8bdb38aee..fc7c0d8196 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
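The new __HIPCC__ branch in the random-number defaults above is deliberately unreachable: curand is a CUDA-only library, so the build system is expected to define MGONGPU_HAS_NO_CURAND for every HIP build, and the #error enforces that invariant at compile time. The same fail-fast idea in isolation, using the macro names from the hunk (a sketch, not the file's code):

// Fail fast on an impossible configuration: a HIP compiler with curand still
// enabled is a build-system bug, better caught at compile time than at run time.
#if defined __HIPCC__ && !defined MGONGPU_HAS_NO_CURAND
#error "HIP builds must disable curand: MGONGPU_HAS_NO_CURAND should be set"
#endif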
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
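The "00 GpuInit" step above replaces CudaRuntime with a vendor-neutral GpuRuntime whose constructor selects the device and whose destructor resets it. A minimal RAII sketch of such a guard follows; gpuSetDevice and gpuDeviceReset are assumed aliases for cudaSetDevice/hipSetDevice and cudaDeviceReset/hipDeviceReset, and the class shipped in GpuRuntime.h may carry more logic.

#include <cstdio>

// Minimal RAII sketch (assumed shape): bind device 0 for the lifetime of
// main() and release it on scope exit - the bracket that "00 GpuInit" times.
struct GpuRuntimeSketch
{
  explicit GpuRuntimeSketch( bool debug = false )
    : m_debug( debug )
  {
    if( m_debug ) printf( "GpuRuntime: selecting device 0\n" );
    gpuSetDevice( 0 ); // assumed alias: cudaSetDevice / hipSetDevice
  }
  ~GpuRuntimeSketch()
  {
    if( m_debug ) printf( "GpuRuntime: resetting the device\n" );
    gpuDeviceReset(); // assumed alias: cudaDeviceReset / hipDeviceReset
  }
  GpuRuntimeSketch( const GpuRuntimeSketch& ) = delete; // one guard per process
  GpuRuntimeSketch& operator=( const GpuRuntimeSketch& ) = delete;
private:
  const bool m_debug;
};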
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index 9051b3108d..b4df38fb35 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2417,7 +2418,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2475,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2534,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,8 +2629,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,9 +2672,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2711,7 +2712,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
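The gpuMemcpyToSymbol calls introduced above (for cHel, cIPD, cNGoodHel, cGoodHel) go through the new GpuAbstraction.h layer, and the explicit checkCuda( cudaMemcpyToSymbol( ... ) ) wrapper disappears from the call sites. A sketch of how such an abstraction can be shaped, with the error check folded into the macro (assumed shape, not the verbatim header):

#include <cassert>
#include <cstdio>
#if defined __CUDACC__
#include <cuda_runtime.h>
using gpuError_t = cudaError_t;
#define gpuSuccessValue cudaSuccess
#define gpuErrorString cudaGetErrorString
#define gpuMemcpyToSymbolRaw cudaMemcpyToSymbol
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
using gpuError_t = hipError_t;
#define gpuSuccessValue hipSuccess
#define gpuErrorString hipGetErrorString
#define gpuMemcpyToSymbolRaw hipMemcpyToSymbol
#endif
#if defined __CUDACC__ || defined __HIPCC__
inline void checkGpu( gpuError_t code ) // illustrative helper: abort on any GPU runtime error
{
  if( code != gpuSuccessValue )
  {
    printf( "GPU runtime error: %s\n", gpuErrorString( code ) );
    assert( false );
  }
}
// Folding the check into the macro is why the call sites above no longer spell it out.
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( gpuMemcpyToSymbolRaw( symbol, src, bytes ) )
#endif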
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2776,12 +2777,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2803,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2928,9 +2929,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -2954,7 +2955,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2974,7 +2975,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2988,9 +2989,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3018,7 +3022,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3228,7 +3232,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index 9f43559181..511b053c2a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
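Both copies of check_sa.cc (the P1 hunks above and the P2 hunks that continue below) swap every HostBuffer* for a PinnedHostBuffer* when MGONGPUCPP_GPUIMPL is set. The motivation is transfer bandwidth: page-locked host memory cannot be swapped out, so the driver can DMA directly from it during the CpHTD/CpDTH steps. A sketch of the underlying allocation choice, with gpuMallocHost/gpuFreeHost as assumed aliases (cudaMallocHost/cudaFreeHost, hipHostMalloc/hipHostFree):

#include <cstddef>

// Sketch only: pageable memory on CPU-only builds, pinned (page-locked)
// memory when a GPU implementation is active.
template<typename T>
T* allocHostBuffer( std::size_t n )
{
#ifdef MGONGPUCPP_GPUIMPL
  T* ptr = nullptr;
  gpuMallocHost( (void**)&ptr, n * sizeof( T ) ); // page-locked: faster H<->D copies
  return ptr;
#else
  return new T[n]; // no device, no pinning needed
#endif
}

template<typename T>
void freeHostBuffer( T* ptr )
{
#ifdef MGONGPUCPP_GPUIMPL
  gpuFreeHost( ptr );
#else
  delete[] ptr;
#endif
}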
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
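In the complex-number tagging that continues below, the new __HIPCC__ branch can only report "CXS:": cuComplex and thrust::complex are CUDA-side types, so a HIP build falls back to MGONGPU_CUCXTYPE_CXSMPL, a simple hand-rolled complex that compiles for host and device alike. A minimal assumption-based model of such a type (the repository's own cxsmpl is richer):

#if !defined( __CUDACC__ ) && !defined( __HIPCC__ )
#define __host__ // let the sketch also compile as plain C++
#define __device__
#endif

// Minimal cxsmpl-like sketch: a plain struct with constexpr operators needs
// no vendor library on either GPU back end.
template<typename FP>
struct cxsmpl_sketch
{
  FP re, im;
  __host__ __device__ constexpr cxsmpl_sketch( FP r = 0, FP i = 0 )
    : re( r ), im( i ) {}
};

template<typename FP>
__host__ __device__ constexpr cxsmpl_sketch<FP>
operator*( const cxsmpl_sketch<FP>& a, const cxsmpl_sketch<FP>& b )
{
  return cxsmpl_sketch<FP>( a.re * b.re - a.im * b.im, a.re * b.im + a.im * b.re );
}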
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 866433ae8b..bc38d1f109 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g g > t t~ d d~ WEIGHTED<=4 @2 // Process: g g > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
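One thing these CPPProcess.cc hunks do not show is how the __global__ kernels (computeDependentCouplings, sigmaKin_getGoodHel, sigmaKin) are launched once both back ends are in play: CUDA uses the <<<...>>> triple-chevron syntax while HIP uses hipLaunchKernelGGL. A unified launch wrapper, sketched here under an assumed name and shape, hides that difference:

// Assumed sketch of a unified launch macro (hypothetical name and shape):
#if defined __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif

// Hypothetical call site (buffer names illustrative), matching the GPU-mode
// signature of computeDependentCouplings above:
//   gpuLaunchKernel( computeDependentCouplings, gpublocks, gputhreads, devGs, devCouplings );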
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index f26b60c5bb..c411623fc8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
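
The gpu*-prefixed calls and the MGONGPUCPP_GPUIMPL guard used throughout these hunks come from the new GpuAbstraction.h header, which this patch adds only as a symlink (its contents are not shown in the diff). A minimal sketch of such a dispatch header, assuming it merely aliases each gpu* name to its CUDA or HIP counterpart, and that checkGpu is an error-checking helper in the spirit of the old checkCuda:

// Illustrative sketch only; the real GpuAbstraction.h may differ in detail.
#if defined __CUDACC__ // nvcc device compiler: map the gpu* aliases onto CUDA
#define MGONGPUCPP_GPUIMPL
#include <cuda_runtime.h>
#define gpuSetDevice cudaSetDevice
#define gpuDeviceReset cudaDeviceReset
#define gpuMemcpyToSymbol( symbol, src, bytes ) \
  checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) ) // checkGpu: assumed helper
#elif defined __HIPCC__ // hipcc device compiler: map the gpu* aliases onto HIP
#define MGONGPUCPP_GPUIMPL
#include <hip/hip_runtime.h>
#define gpuSetDevice hipSetDevice
#define gpuDeviceReset hipDeviceReset
#define gpuMemcpyToSymbol( symbol, src, bytes ) \
  checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif
// Plain C++ builds define neither compiler macro, so MGONGPUCPP_GPUIMPL stays
// undefined and every '#ifdef MGONGPUCPP_GPUIMPL' block above compiles out.

Under a scheme like this, the error check is folded into the macro itself, which would explain why gpuMemcpyToSymbol appears without an explicit checkCuda wrapper in the hunks above.
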
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 1be98364ee..a17bd3518e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ g d WEIGHTED<=4 @2 // Process: g s > t t~ g s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, -1, 1, -1, -1, 1 }, { 1, -1, 1, -1, 1, -1 }, { 1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index 853175b477..9c820a5ddb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
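
The '00 GpuInit' step in the check_sa.cc hunk below instantiates a GpuRuntime object that this patch documents only through its comment: device selection at construction, device reset at destruction. Its definition is not part of the diff; a plausible RAII sketch, reusing the hypothetical gpu* aliases and checkGpu helper assumed earlier, might be:

#include <iostream>

// Illustrative sketch only, not the actual GpuRuntime.h added by this patch.
struct GpuRuntime final
{
  GpuRuntime( const bool debug = false )
    : m_debug( debug )
  {
    // Select device 0 up front so that all later allocations, constant-memory
    // copies and kernel launches target the same GPU.
    checkGpu( gpuSetDevice( 0 ) );
    if( m_debug ) std::cout << "GpuRuntime: selected GPU device 0" << std::endl;
  }
  ~GpuRuntime()
  {
    // Booked for the end of main(): releases the device context
    // (cudaDeviceReset under CUDA, hipDeviceReset under HIP).
    if( m_debug ) std::cout << "GpuRuntime: resetting GPU device" << std::endl;
    gpuDeviceReset();
  }
  GpuRuntime( const GpuRuntime& ) = delete; // single owner of the device context
  GpuRuntime& operator=( const GpuRuntime& ) = delete;
private:
  const bool m_debug;
};

With this shape, the statement 'GpuRuntime GpuRuntime( debug );' in the hunk below compiles, since the object name legally shadows the type name for the remainder of main(), although a distinct variable name would be the more conventional choice.
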
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index dfb05016f5..6a53d09c8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ g d~ WEIGHTED<=4 @2 // Process: g s~ > t t~ g s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index e60cb5b6d7..a5a285b22d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
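Illustrative aside (not part of the patch): the RandomNumberMode cascade in the hunk above encodes one default per platform, and the new '#error' makes a silent misconfiguration loud: curand is a CUDA-only library, so a HIP build that still has curand enabled can only be a build-system mistake. The same logic restated as ordinary C++ for readability (defaultRndgen and its boolean parameters are illustrative stand-ins for the preprocessor macros):

#include <stdexcept>

enum class RandomNumberMode{ CommonRandom = 0, CurandHost = 1, CurandDevice = 2 };

RandomNumberMode defaultRndgen( bool hasCurand, bool isCuda, bool isHip )
{
  if( !hasCurand ) return RandomNumberMode::CommonRandom; // only option without curand (PRs #784 and #785)
  if( isHip ) throw std::logic_error( "HIP builds must define MGONGPU_HAS_NO_CURAND" ); // mirrors the #error above
  if( isCuda ) return RandomNumberMode::CurandDevice; // generate directly on the NVidia GPU
  return RandomNumberMode::CurandHost; // plain C++ build: generate on the CPU via curand's host API
}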
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index ecef3e57ca..fedf955b6a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c s > t t~ c s WEIGHTED<=4 @2 // Process: d s > t t~ d s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
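Illustrative aside (not part of the patch): these CPPProcess.cc hunks all rely on one single-source pattern that the rename makes explicit. When MGONGPUCPP_GPUIMPL is set, a function body like sigmaKin in the hunks that follow is a __global__ kernel handling exactly one event, indexed by its GPU thread; in the C++ build the same body takes an extra trailing nevt argument and loops over events on the host. Stripped to its skeleton (resetMEs is a made-up name, the alias stands in for the real fptype from mgOnGpuConfig.h):

using fptype = double; // stand-in for the fptype defined in mgOnGpuConfig.h

#ifdef MGONGPUCPP_GPUIMPL
__global__ void resetMEs( fptype* allMEs ) // GPU: a kernel for one event per thread
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  allMEs[ievt] = 0;
}
#else
void resetMEs( fptype* allMEs, const int nevt ) // C++: one call processes n events
{
  for( int ievt = 0; ievt < nevt; ++ievt )
    allMEs[ievt] = 0;
}
#endif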
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -746,12 +747,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +773,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +899,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +925,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +945,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +959,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +992,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1202,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 5329710b87..8c84687f8a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
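Illustrative aside (not part of the patch): in the memory-buffer hunks a little further below, every HostBuffer* switches to PinnedHostBuffer* once MGONGPUCPP_GPUIMPL is set. Pinned (page-locked) host memory is what allows fast, asynchronous host-device copies, so the GPU build pays its higher allocation cost while the pure C++ build keeps cheap pageable memory. A minimal sketch of the distinction, assuming the CUDA flavour of the runtime and the checkCuda helper used elsewhere in this codebase (deallocation would symmetrically use cudaFreeHost versus delete[]):

#ifdef MGONGPUCPP_GPUIMPL
#include <cuda_runtime.h> // this sketch assumes the CUDA backend
#endif

using fptype = double; // stand-in for the fptype from mgOnGpuConfig.h

fptype* allocHostBuffer( const int nevt )
{
  fptype* buf = nullptr;
#ifdef MGONGPUCPP_GPUIMPL
  checkCuda( cudaMallocHost( reinterpret_cast<void**>( &buf ), nevt * sizeof( fptype ) ) ); // page-locked
#else
  buf = new fptype[nevt]; // pageable memory: no device copies will ever touch it
#endif
  return buf;
}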
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index e4f9dee3a2..fc99b3bfae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s c~ > t t~ s c~ WEIGHTED<=4 @2 // Process: s d~ > t t~ s d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
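Illustrative aside (not part of the patch): the cHel/cIPD/cGoodHel hunks above all share one shape: small tables are computed once on the host, then copied either into __constant__ device memory, which kernels can read cheaply and uniformly, or into file-scope static storage in the C++ build. Reduced to its skeleton, reusing the gpuMemcpyToSymbol name from the diff (the 4 and 2 dimensions are placeholders, not the real ncomb and npar):

#include <cstring>

#ifdef MGONGPUCPP_GPUIMPL
__device__ __constant__ short cTable[4][2]; // constant-memory copy on the device
#else
static short cTable[4][2]; // C++ emulation in file-scope static memory
#endif

void copyTable( const short tTable[4][2] )
{
#ifdef MGONGPUCPP_GPUIMPL
  gpuMemcpyToSymbol( cTable, tTable, 4 * 2 * sizeof( short ) ); // one copy at initialisation
#else
  memcpy( cTable, tTable, 4 * 2 * sizeof( short ) );
#endif
}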
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -752,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 391789dc81..da747c3465 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
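Before the buffer allocations, check_sa.cc now instantiates a GpuRuntime where it previously instantiated a CudaRuntime (see the STEP 0 hunk just below). Going only by the rewritten comment in that hunk, set the device in the constructor and book a reset in the destructor, the class behaves like this RAII sketch; gpuSetDevice and gpuDeviceReset are assumed names chosen by analogy with gpuMemcpyToSymbol, and the real GpuRuntime.h symlink target is not part of this diff:

// GpuRuntime -- illustrative RAII sketch, assuming the GpuAbstraction.h
// mappings sketched earlier plus gpuSetDevice/gpuDeviceReset equivalents
#include <iostream>
struct GpuRuntime
{
  GpuRuntime( const bool debug = true )
    : m_debug( debug )
  {
    if( m_debug ) std::cout << "GpuRuntime: setting GPU device 0" << std::endl;
    checkGpu( gpuSetDevice( 0 ) ); // cudaSetDevice(0) on CUDA, hipSetDevice(0) on HIP (assumed names)
  }
  ~GpuRuntime()
  {
    if( m_debug ) std::cout << "GpuRuntime: resetting GPU device" << std::endl;
    gpuDeviceReset(); // the device reset "booked" for the end of main by the constructor
  }
  GpuRuntime( const GpuRuntime& ) = delete; // single owner for the whole application
  GpuRuntime& operator=( const GpuRuntime& ) = delete;
private:
  const bool m_debug;
};

Constructing it as the first object in main (and destroying it as the last) guarantees the device is initialised before any buffer is allocated and torn down in a defined order afterwards.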
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
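The next hunks extend that workflow tag with a HIP branch: a CUDA build may use cuComplex or thrust::complex, while a HIP build falls back to the plugin's own simple complex class (MGONGPU_CUCXTYPE_CXSMPL, tagged "CXS:") because the CUDA complex types are unavailable there, and a C++ build uses std::complex (tagged "STX:"). As a sketch of the dispatch these tags describe (typedef names illustrative; cxsmpl is the plugin's own class from its mgOnGpu headers, and the CUDA tag strings are elided in the hunk):

// Complex-type selection implied by the workflow tags below (sketch)
#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // CUDA, thrust complex
#elif defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
#include <cuComplex.h>
typedef cuDoubleComplex cxtype; // CUDA, cuComplex
#elif defined __HIPCC__
typedef mgOnGpu::cxsmpl<double> cxtype; // "CXS:" -- the plugin's own complex class
#else
#include <complex>
typedef std::complex<double> cxtype; // "STX:" -- plain C++ build
#endif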
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 302d63e31d..97912e5855 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d > t t~ d d WEIGHTED<=4 @2 // Process: s s > t t~ s s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
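The same one-call-site pattern covers the helicity filtering in the hunks that follow: sigmaKin_getGoodHel flags the non-vanishing helicity combinations, and the compacted list is copied either to GPU constant memory or to its file-scope C++ emulation. A condensed sketch of that bookkeeping, using the cNGoodHel/cGoodHel names from the diff (the driver code around them is simplified, not the generated source):

// Copy the "good" helicities to constant memory (GPU) or statics (C++) -- sketch
constexpr int ncomb = 64; // illustrative: 2^6 helicity combinations for 6 external legs
#ifdef MGONGPUCPP_GPUIMPL
__device__ __constant__ int cNGoodHel;
__device__ __constant__ int cGoodHel[ncomb];
#else
static int cNGoodHel;
static int cGoodHel[ncomb];
#endif
void copyGoodHel( const bool* isGoodHel ) // isGoodHel[ncomb], filled by sigmaKin_getGoodHel
{
  int goodHel[ncomb] = { 0 };
  int nGoodHel = 0;
  for( int ihel = 0; ihel < ncomb; ihel++ )
    if( isGoodHel[ihel] ) goodHel[nGoodHel++] = ihel; // compact the surviving helicities
#ifdef MGONGPUCPP_GPUIMPL
  gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
  gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
#else
  cNGoodHel = nGoodHel;
  for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
#endif
}

The helicity loop in sigmaKin then iterates over the cNGoodHel surviving entries only, skipping combinations that cannot contribute.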
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 2d95f4b170..d8232ea652 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
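In the allocation hunks below, every host buffer again switches from HostBufferX in CPU-only builds to PinnedHostBufferX when MGONGPUCPP_GPUIMPL is set. Pinned (page-locked) host memory is what lets the later CpHTD/CpDTH copy steps run at full bus bandwidth and asynchronously. The real buffer classes are defined elsewhere in the SubProcesses tree and are not part of this diff; this is a minimal sketch of the idea, with gpuMallocHost/gpuFreeHost as assumed wrapper names (cudaMallocHost on CUDA, hipHostMalloc on HIP):

// Pinned host buffer -- sketch only, not the plugin's buffer implementation
template<typename T>
class PinnedHostBuffer
{
public:
  explicit PinnedHostBuffer( const size_t size )
    : m_size( size ), m_data( nullptr )
  {
    checkGpu( gpuMallocHost( (void**)&m_data, m_size * sizeof( T ) ) ); // page-locked allocation
  }
  ~PinnedHostBuffer() { gpuFreeHost( m_data ); }
  T* data() { return m_data; }
  size_t size() const { return m_size; }
private:
  const size_t m_size;
  T* m_data;
};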
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index d0be5131af..be2315b035 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s s~ > t t~ c c~ WEIGHTED<=4 @2 // Process: s s~ > t t~ d d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
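The tail of each sigmaKin hunk (see the P2_uux_ttxccx version below and the identical P2_uu_ttxuu version above) performs the same final normalisation on both the GPU and C++ paths. In outline, with plain double standing in for fptype (a condensed sketch, not the generated code):

// Final |M|^2 normalisation at the end of sigmaKin -- sketch
// (the spin/color/identical-particle denominator is process-specific:
// 72 for P2_uu_ttxuu above, 36 for P2_uux_ttxccx below)
inline void finaliseEvent( double* allMEs,
                           const double* allNumerators,
                           const double* allDenominators,
                           const int ievt,
                           const int helcolDenominator,
                           const unsigned int channelId )
{
  allMEs[ievt] /= helcolDenominator; // average the running sum over helicities/colors
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // multichannel mode: reweight by the per-event numerator/denominator ratio
  if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt];
#endif
}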
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -752,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 14490d782f..71fdc6e547 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
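The check_sa.cc hunks that follow encode the random-generator policy per platform: HIP builds must have MGONGPU_HAS_NO_CURAND set (hence the new #error guard), so they default to the common host generator, while CUDA builds keep curand on the device and plain C++ builds keep curand on the host. Restated as a compile-time selector (sketch; the enum values are those visible in the diff):

// Default random-number mode per build flavour -- sketch of the check_sa.cc logic
enum class RandomNumberMode
{
  CommonRandom = 0, // portable host generator, the only mode without curand
  CurandHost = 1,   // curand, generating on the host
  CurandDevice = 2  // curand, generating on the NVidia device
};
constexpr RandomNumberMode defaultRndgen()
{
#ifdef MGONGPU_HAS_NO_CURAND
  return RandomNumberMode::CommonRandom; // includes all HIP builds (PR #784 and #785)
#elif defined __CUDACC__
  return RandomNumberMode::CurandDevice; // NVidia GPU build with curand available
#else
  return RandomNumberMode::CurandHost; // C++ build with curand available
#endif
}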
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
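// [Editor's aside: illustrative sketch, not part of this patch.] The wrkflwtxt branches below
// print which complex-number backend was chosen at build time. A minimal sketch of the dispatch
// that the MGONGPU_CUCXTYPE_*/MGONGPU_CPPCXTYPE_* macros assume (presumably set in mgOnGpuConfig.h;
// treat the exact defaults shown here as assumptions, not as the repository's actual choices):
#ifdef __CUDACC__
#define MGONGPU_CUCXTYPE_THRUST // CUDA builds: thrust::complex (cuComplex is the alternative)
#elif defined __HIPCC__
#define MGONGPU_CUCXTYPE_CXSMPL // HIP builds: the simple cxsmpl type (no thrust/cuComplex on AMD)
#else
#define MGONGPU_CPPCXTYPE_STDCOMPLEX // C++ builds: std::complex (cxsmpl is the alternative)
#endif
// With exactly one macro defined per build, every "???" branch below is genuinely unreachable.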
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 3a2178d534..c83b7be449 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g g WEIGHTED<=4 @2 // Process: s s~ > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, 1 }, { -1, 1, 1, -1, 1, -1 }, { -1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
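// [Editor's aside: illustrative sketch, not part of this patch.] The gpuMemcpyToSymbol calls
// introduced in the hunks above assume that GpuAbstraction.h maps one gpu* spelling onto the
// vendor APIs. A minimal sketch of that mapping, assuming it is macro based; the real header
// may differ in names and error handling (checkHip is hypothetical here, checkCuda is the
// helper whose call sites this diff removes):
#ifdef __CUDACC__
#define gpuMemcpyToSymbol( dst, src, bytes ) checkCuda( cudaMemcpyToSymbol( dst, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( dst, src, bytes ) checkHip( hipMemcpyToSymbol( HIP_SYMBOL( dst ), src, bytes ) )
#endif
// One spelling in CPPProcess.cc then builds unchanged with both nvcc and hipcc.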
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 1543c29649..e9a24f516d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
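// [Editor's aside: illustrative sketch, not part of this patch; it refers to the "00 GpuInit"
// step earlier in this check_sa.cc, not to the complex-number printout below.] The GpuRuntime
// object that replaces CudaRuntime is an RAII guard; a minimal sketch of the behaviour the
// comments above describe, with hypothetical names (the real class lives in GpuRuntime.h and
// presumably goes through the gpu* abstraction layer rather than calling CUDA directly):
struct GpuRuntimeSketch
{
  GpuRuntimeSketch( bool debug = true ) : m_debug( debug ) { gpuSetDevice( 0 ); } // CUDA: cudaSetDevice(0)
  ~GpuRuntimeSketch() { gpuDeviceReset(); } // CUDA: cudaDeviceReset(), booked for the end of main
  bool m_debug; // kept only for parity with the 'GpuRuntime gpuRuntime( debug )' call site
};
// Instantiated first thing in main(), it is destroyed last, so the device reset runs only
// after every device buffer has already been freed.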
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 70fbbee59f..3ecdb48914 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ d d~ WEIGHTED<=4 @2 // Process: s s~ > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
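// [Editor's aside: illustrative sketch, not part of this patch.] The distinction drawn in this
// comment block: __NVCC__ marks "the nvcc driver compiled this translation unit" (even a plain
// .cc file), __CUDACC__/__HIPCC__ mark device compilation, and MGONGPUCPP_GPUIMPL is this
// codebase's own "build the GPU implementation" switch. A tiny probe (whichCompiler() is a
// hypothetical helper, not repository code):
inline const char* whichCompiler()
{
#ifdef __NVCC__
  return "nvcc driver"; // defined by nvcc for every file it drives, .cu or .cc
#elif defined __HIPCC__
  return "hipcc"; // defined when hipcc compiles the code
#else
  return "plain C++ compiler"; // gcc/clang/icpx building the CPU-only path
#endif
}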
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 58cece5c62..d8d3d481ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
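In the allocation step above, every HostBufferX in the GPU build becomes a PinnedHostBufferX: page-locked host memory cannot be swapped out, so host-to-device and device-to-host copies can run as direct DMA transfers without an intermediate staging copy. A sketch of the allocation pattern behind such a class, assuming gpuMallocHost/gpuFreeHost map onto cudaMallocHost/cudaFreeHost and hipHostMalloc/hipHostFree (the real buffer classes are defined elsewhere in the codebase, not in this diff):

    #include "GpuAbstraction.h" // assumed to provide gpuMallocHost/gpuFreeHost
    #include <cstddef>
    template<typename T>
    class PinnedHostBufferSketch // illustrative stand-in for the real PinnedHostBufferXxx classes
    {
    public:
      PinnedHostBufferSketch( size_t n ) : m_size( n ) { gpuMallocHost( (void**)&m_data, n * sizeof( T ) ); }
      ~PinnedHostBufferSketch() { gpuFreeHost( m_data ); } // page-locked memory must be freed with the matching API
      T* data() { return m_data; }
      size_t size() const { return m_size; }
    private:
      T* m_data;
      size_t m_size;
    };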
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 7df13a2341..e21d1f0c48 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c~ s~ > t t~ c~ s~ WEIGHTED<=4 @2 // Process: d~ s~ > t t~ d~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
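In the cHel and cIPD hunks above, the explicit checkCuda( cudaMemcpyToSymbol( ... ) ) call sites collapse to a bare gpuMemcpyToSymbol( ... ), so the error check is presumably folded into the abstraction macro itself. A hypothetical checking helper in that spirit (the actual helper name and location are not visible in this diff):

    #include "GpuAbstraction.h" // assumed to provide gpuError_t, gpuSuccess and gpuGetErrorString
    #include <cassert>
    #include <cstdio>
    inline void checkGpuImpl( gpuError_t code, const char* file, int line )
    {
      if( code != gpuSuccess )
      {
        printf( "ERROR! GPU call failed at %s:%d: %s\n", file, line, gpuGetErrorString( code ) );
        assert( code == gpuSuccess ); // fail hard, mirroring the old checkCuda behaviour
      }
    }
    #define checkGpu( code ) checkGpuImpl( code, __FILE__, __LINE__ )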
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -746,12 +747,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +773,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +899,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +925,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +945,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +959,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +992,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1202,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 6bd3135c3c..901c6dfcc9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
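The #error branch added above turns a silent misconfiguration into a compile-time failure: if hipcc ever reaches this point, MGONGPU_HAS_NO_CURAND should already have been defined, since curand device generation exists only on the CUDA stack. This diff does not show where that define is made; a plausible sketch of the configuration-header logic, offered purely as an assumption:

    // Assumed mgOnGpuConfig.h logic, not shown in this diff: curand cannot be
    // linked into hipcc builds, so they must use the CommonRandom host generator.
    #ifdef __HIPCC__
    #ifndef MGONGPU_HAS_NO_CURAND
    #define MGONGPU_HAS_NO_CURAND 1
    #endif
    #endif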
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
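The '00 GpuInit' step in the hunks above replaces CudaRuntime with a GpuRuntime whose comment spells out an RAII contract: construction selects device 0, destruction resets the device when main() returns. GpuRuntime.h appears in this diff only as a symlink, so the class below is just a sketch honouring that contract, built on the assumed gpuSetDevice/gpuDeviceReset mappings:

    #include "GpuAbstraction.h" // assumed to provide gpuSetDevice/gpuDeviceReset
    #include <cstdio>
    struct GpuRuntimeSketch // illustrative stand-in for the real GpuRuntime
    {
      GpuRuntimeSketch( bool debug = false ) : m_debug( debug )
      {
        if( m_debug ) printf( "GpuRuntime: setting device 0\n" );
        gpuSetDevice( 0 ); // cudaSetDevice on CUDA, hipSetDevice on HIP
      }
      ~GpuRuntimeSketch()
      {
        if( m_debug ) printf( "GpuRuntime: resetting the device\n" );
        gpuDeviceReset(); // the 'booked' teardown runs automatically at the end of main()
      }
      bool m_debug;
    };

One stylistic point: the patch instantiates it as 'GpuRuntime GpuRuntime( debug )', shadowing the type name with the object name. That is legal C++, but the type can afterwards only be referred to in that scope via an elaborated 'class GpuRuntime' specifier.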
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index f464c27160..527b1d3c8f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d~ d~ > t t~ d~ d~ WEIGHTED<=4 @2 // Process: s~ s~ > t t~ s~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
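The bracketed comment above distinguishes compiler-identification macros that this patch uses with quite different meanings, and they are easy to conflate. A compact summary (semantics as documented for nvcc and hipcc; the last entry is this patch's own convention):

    // __NVCC__           : set by nvcc for every translation unit it drives, even a plain .cc host file
    // __CUDACC__          : set only while nvcc compiles CUDA code ('nvcc -x cu'), i.e. device-capable units
    // __HIPCC__           : set while hipcc compiles HIP code; the AMD analogue of __CUDACC__
    // MGONGPUCPP_GPUIMPL  : vendor-neutral switch introduced by this patch, active under either GPU backend
    #if defined __CUDACC__
    constexpr const char* mgBackend = "CUDA"; // 'mgBackend' is an illustrative name, not from the patch
    #elif defined __HIPCC__
    constexpr const char* mgBackend = "HIP";
    #else
    constexpr const char* mgBackend = "C++";
    #endif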
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 4e53fa1250..c2ca443c0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
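The CPPProcess.h hunks above keep a single __global__ declaration each for computeDependentCouplings, sigmaKin_getGoodHel and sigmaKin under the MGONGPUCPP_GPUIMPL guard, which works because hipcc accepts the same triple-chevron launch syntax as nvcc. A sketch of the corresponding launch-site pattern (the wrapper function and its raw-pointer signature are illustrative, not from the patch):

    #include "GpuAbstraction.h" // assumed: gpuPeekAtLastError mapping plus the hypothetical checkGpu helper
    // Hypothetical free function showing how one call site can serve all three backends.
    void runDependentCouplings( const fptype* gs, fptype* couplings, int gpublocks, int gputhreads, int nevt )
    {
    #ifdef MGONGPUCPP_GPUIMPL
      // one GPU thread per event; nvcc and hipcc both compile this launch syntax
      computeDependentCouplings<<<gpublocks, gputhreads>>>( gs, couplings );
      checkGpu( gpuPeekAtLastError() );
    #else
      computeDependentCouplings( gs, couplings, nevt ); // the host path loops over all nevt events internally
    #endif
    }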
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */
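One detail behind the HostBuffer/PinnedHostBuffer pairs allocated earlier in this file: in GPU builds the host-side buffers use page-locked (pinned) memory, which is what makes the CpHTD/CpDTH copy steps timed above fast. MemoryBuffers.h is not shown in this excerpt, so this is only a sketch, with hypothetical names, of what the pinned variant amounts to under CUDA:

#include <cstddef>
#include <cuda_runtime.h>
template<typename T>
struct PinnedHostBufferSketch
{
  PinnedHostBufferSketch( std::size_t n )
  {
    cudaMallocHost( (void**)&m_data, n * sizeof( T ) ); // page-locked host memory: fast, async-capable HtoD/DtoH copies
  }
  ~PinnedHostBufferSketch() { cudaFreeHost( m_data ); }
  T* data() { return m_data; }
private:
  T* m_data;
};

// -- CUCOMPLEX or THRUST or STD complex numbers?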
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
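One aside before the rest of the makefile changes: the #ifdef MGONGPUCPP_GPUIMPL namespace hunks that recur in every header and source file above all serve the same single-source idiom. The same translation unit is compiled twice, once by the C++ compiler and once by the GPU compiler, and lands in a different namespace each time, so both variants can be linked into one executable without symbol clashes. In miniature (a sketch, not project code):

#include "mgOnGpuConfig.h" // defines (or leaves undefined) MGONGPUCPP_GPUIMPL
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  int answer() { return 42; } // becomes mg5amcGpu::answer in GPU builds, mg5amcCpu::answer in C++ builds
}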
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu;
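The fbridgecreate_/fbridgedelete_ pair above, which GpuRuntime::setUp and GpuRuntime::tearDown now bracket, follows the classic opaque-handle pattern for driving C++ objects from Fortran: Fortran stores only a pointer-sized handle and C++ owns the object behind it. A reduced sketch of the pattern (hypothetical names, not the real fbridge API):

#include <vector>
extern "C"
{
  // Fortran side: CALL SKETCHCREATE( HANDLE ) with a C_PTR (or INTEGER*8) HANDLE
  void sketchcreate_( void** pphandle ) { *pphandle = new std::vector<double>( 16 ); }
  // Fortran side: CALL SKETCHDELETE( HANDLE )
  void sketchdelete_( void** pphandle )
  {
    delete static_cast<std::vector<double>*>( *pphandle ); // C++ frees what C++ allocated
    *pphandle = nullptr;
  }
}

diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.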
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct object-build flags (-fPIC -c, plus -x cu for nvcc) for the GPU compiler in use +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
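What the relocated -x cu flag and the %_cu.o rule above achieve, summarized as a comment (the hipcc behaviour is stated as an assumption, not verified in this excerpt):

// The same source is built twice into differently suffixed objects:
//   $(CXX)   Parameters_sm.cc          -> Parameters_sm.o    (plain C++, namespace mg5amcCpu)
//   $(GPUCC) Parameters_sm.cc [-x cu]  -> Parameters_sm_cu.o (GPU build, namespace mg5amcGpu)
// nvcc needs -x cu to treat a .cc file as CUDA; hipcc is assumed to accept C++
// sources directly, which is why the extra flag is added only on the nvcc branch.

diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.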
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e.
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace 
mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/tools/profiling/README.md b/tools/profiling/README.md new file mode 100644 index 0000000000..1a5251d93b --- /dev/null +++ b/tools/profiling/README.md @@ -0,0 +1,163 @@ +# Documentation + +We are currently using [GitHub Actions](https://docs.github.com/en/actions) in conjunction with onsite self-hosted [GitHub Runners](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners) to automate compiling/testing and performance profiling tasks in SYCL and CUDA on A100 and V100S GPUs. + +## Grafana link: [madgraph4gpu-db.web.cern.ch](https://madgraph4gpu-db.web.cern.ch/) + +## Performance Profiling + +### Profiling baseline currently used + +**GCC - 11.3.0** + +**CUDA - 12.0.1** + +**Clang - 16** + +### GitHub Actions Runner + +A [GitHub Runner](https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners) is a tool that allows users to automate their workflow by running [actions](https://docs.github.com/en/actions) or tasks in response to specific events on GitHub. This can include tasks such as running tests, building and deploying code, or publishing artifacts.
They can be easily configured and managed through the GitHub website, and can help users streamline their development process and ensure that their code is always up-to-date and ready for deployment. In our case we use them to automate CI and nightly performance profiling. + +### performanceProfiler.py + +This is the main entrypoint for the profiler. It executes the two bash build scripts for SYCL (`buildSYCLProcess.sh`) and CUDA (`buildCUDAProcess.sh`) with the correct ThreadsPerBlock, BlocksPerGrid and iteration count. + +#### Usage: + +Go to the `tools/profiling` directory and run: + +``` +python3 performanceProfiler.py -l <abstraction_layer> -b <branch> +``` + +The following options are available for this script: + +`-l`: This option specifies the abstraction layer to use for profiling. The supported values are "SYCL" and "CUDA". The default value is "SYCL". + +`-b`: This option specifies the branch of the madgraph4gpu repository that will be used. The default value is "master". + +Example: + +To run the script with the default options: + +``` +python3 performanceProfiler.py +``` + +To run the script with a different abstraction layer and branch: + +``` +python3 performanceProfiler.py -l CUDA -b my_branch +``` + +### buildSYCLProcess.sh + +This bash script compiles and executes standalone physics processes using the MadGraph5_aMC@NLO GPU development framework with oneAPI/SYCL. + +#### Usage + +Go to the `tools/profiling` directory and run: + +``` +./buildSYCLProcess.sh -n <process_name> -b <blocks_per_grid> -t <threads_per_block> -i <iterations> [-r <branch>] [-d <device_id>] +``` + +#### Arguments: + +* `-n`: Name of the physics process being built and run (e.g., gg_ttgg). + +* `-b`: Number of blocks per grid. + +* `-t`: Number of threads per block. + +* `-i`: Number of iterations. + +* `-r`: (Optional) Branch name. Default: none, in which case the branch is not displayed in the report folder prefix. + +* `-d`: (Optional) Flag for setting the device ID. Default: "--device_id 2" for oneAPI toolkit runs on GPUs, otherwise "--device_id 0" for the LLVM DPCPP compiler. You can also use `-d info` to list the device IDs available on that host. + +#### Example: + +``` +./buildSYCLProcess.sh -n gg_ttgg -b 1024 -t 128 -i 10 -r master -d 2 +``` + +**Note**: + +To also compile for CPUs you need to enable more backends in the DPCPP toolchain (following the current instructions for using the LLVM DPCPP compiler for CUDA does not install the dependencies needed to see the other devices on the host). You can read more on how to enable more backends [here](https://intel.github.io/llvm-docs/GetStartedGuide.html#build-dpc-toolchain). + +### buildCUDAProcess.sh + +This script compiles and executes physics processes using the MadGraph5_aMC@NLO GPU development framework with CUDA. + +#### Usage + +Go to the `tools/profiling` directory and run: + +``` +./buildCUDAProcess.sh -n <process_name> -b <blocks_per_grid> -t <threads_per_block> -i <iterations> -r <branch> -m <makefile_args> +``` + +#### Arguments: + +* `-n`: Name of the physics process being built and run. + +* `-b`: Number of blocks per grid. + +* `-t`: Number of threads per block. + +* `-i`: Number of iterations. + +* `-r`: Branch name. + +* `-m`: Makefile arguments. + +#### Example: + +``` +./buildCUDAProcess.sh -n gg_ttgg -b 1024 -t 128 -i 10 -r master -m avx2 +``` + +#### Notes + +This script assumes that it is run from the profiling directory in the repository. +Make sure to set the correct CUDA path according to your system. +You may need to modify the script to set the correct GPU architecture or compiler options depending on your system, as sketched below.
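+For example, a wrapper that pins the toolchain before calling the script could look like the following minimal sketch (the paths and the name prefix here are placeholders, not the values used on our runners): + +``` +# Hypothetical environment setup; adjust the paths to your system +export CUDA_HOME=/usr/local/cuda-12.0 +export PATH=$CUDA_HOME/bin:$PATH +export CUDA_NAME_PREFIX=cudacpp_my-cpu_my-gpu_gcc-x.y_cuda-x.y.z +./buildCUDAProcess.sh -n gg_ttgg -b 1024 -t 128 -i 10 -r master -m avx2 +```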
+ +### sendData.py + +#### Usage: + +Go to the `tools/profiling` directory and run: + +``` +python3 sendData.py -r <report_path> -b <branch> +``` + +The following arguments are available for this script: + +* `-r` or `--reportPath`: This argument specifies the path for the reports that will be sent to the database. + +* `-f` or `--fields`: This argument specifies the fields in the JSON data that will be sent to the database. The default value is `['EvtsPerSec[MatrixElems] (3)', 'EvtsPerSec[MECalcOnly] (3)']`. + +* `-a` or `--absLayer`: This argument specifies the abstraction layer (SYCL, CUDA or HIP) that was used when profiling. The default value is `SYCL`. + +* `-b` or `--branch`: This argument specifies the branch that the profiler data is in. The default value is `master`. + +* `-p` or `--profiler`: This argument enables CI profiling defaults. The default value is `0` (disabled). + +For example, to run the script with the default arguments, you can use the following command: + +``` +python3 sendData.py +``` + +To run the script with a custom report path and branch, you can use the following command: + +``` +python3 sendData.py -r /path/to/reports -b my_branch +``` + +Note that some options may not be relevant or may not work as expected in certain situations. For example, the `-p` option only works when the CI environment variables (such as `SYCL_NAME_PREFIX`, `CUDA_NAME_PREFIX` or `HIP_NAME_PREFIX`) are set. + +## Known issues: + +### Bug in GCC 11.3.0/11.3.1 using the LLVM DPCPP compiler + +There is a [bug](https://bugs.gentoo.org/842405) affecting GCC versions 11.3.0/11.3.1 when compiling the standalone physics processes, resulting in two compilation errors `.../fs_path.h:1209:9: error: 'end' is missing exception specification 'noexcept'` and `.../fs_path.h:1217:9: error: 'end' is missing exception specification 'noexcept'` in the `fs_path.h` file. GCC version 11.2.0 is not affected, and the bug appears to be fixed in later versions (this remains to be tested and cited). + +### libmg5amc_common.so: cannot open shared object file: No such file or directory + +The directory containing the `libmg5amc_common.so` library is not in `LD_LIBRARY_PATH`; add it before running the executable (see the sketch at the end of this README). + +### Not linking correctly/Wrong linker version from what you intend to compile with? + +If you have problems with the wrong linker, check which GCC candidate is found with `./sycl_workspace/llvm/build/bin/clang++ -v` and see if it is the correct one. If it is not, you can correct this by adding `--gcc-toolchain=/cvmfs/sft.cern.ch/lcg/releases/gcc/11.3.0-ad0f5/x86_64-centos8/lib/gcc/x86_64-pc-linux-gnu/11.3.0` to the `CXXFLAGS`, which sets the GCC candidate to the desired GCC installation. Using `ENABLE_CI_PROFILER=1` adds this automatically in all the standalone physics process makefiles in SYCL and in CUDA.
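+ +As a minimal sketch of the `libmg5amc_common.so` fix mentioned above (the library location is hypothetical and depends on where your process was built): + +``` +# Hypothetical path; use the lib directory of your own build +export LD_LIBRARY_PATH=/path/to/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/lib:$LD_LIBRARY_PATH +./gcheck.exe -p 2048 256 12 +```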
diff --git a/tools/profiling/buildCUDAProcess.sh b/tools/profiling/buildCUDAProcess.sh new file mode 100755 index 0000000000..0923aca9ab --- /dev/null +++ b/tools/profiling/buildCUDAProcess.sh @@ -0,0 +1,135 @@ +#!/bin/bash + +# +# __ __ _ ____ _ _ _ ____ ____ _ _ +# | \/ | __ _ __| | / ___| _ __ __ _ _ __ | |__ | || | / ___| | _ \ | | | | +# | |\/| | / _` | / _` | | | _ | '__| / _` | | '_ \ | '_ \ | || |_ | | _ | |_) | | | | | +# | | | | | (_| | | (_| | | |_| | | | | (_| | | |_) | | | | | |__ _| | |_| | | __/ | |_| | +# |_| |_| \__,_| \__,_| \____| |_| \__,_| | .__/ |_| |_| |_| \____| |_| \___/ +# |_| +# +# +# Bash script for compiling and executing physics processes using the MadGraph5_aMC@NLO GPU development framework +# using CUDA/HIP +# +# Author: Jorgen Teig, CERN 2023 +# + +helpFunction() +{ + echo "" + echo "Usage: $0 -n gg_ttgg -b 1024 -t 128 -i 10" + echo -e "\t-n Name of the physics process being built and run" + echo -e "\t-b Blocks per grid" + echo -e "\t-t Threads per block" + echo -e "\t-i Iterations" + echo -e "\t-r Branch" + echo -e "\t-m Makefile arguments" + exit 1 # Exit script after printing help +} + +while getopts "n:b:t:i:r:m:a:" opt +do + case "$opt" in + n ) MG_PROC="$OPTARG" ;; #process to target + b ) blocksPerGrid="$OPTARG" ;; + t ) threadsPerBlock="$OPTARG" ;; + i ) iterations="$OPTARG" ;; + r ) branch="$OPTARG" ;; + m ) makeArgs="$OPTARG" ;; + ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent + esac +done + +# Print helpFunction in case parameters are empty +if [ -z "${MG_PROC}" ] || [ -z "${blocksPerGrid}" ] || [ -z "${threadsPerBlock}" ] || [ -z "${iterations}" ] +then + echo "Some or all of the parameters are empty"; + helpFunction +fi + +# Begin script in case all parameters are correct + +# Warn if neither the CUDA_NAME_PREFIX nor the HIP_NAME_PREFIX variable is set +if [ -z "$CUDA_NAME_PREFIX" -a -z "$HIP_NAME_PREFIX" ]; then + echo "WARNING: Neither CUDA_NAME_PREFIX nor HIP_NAME_PREFIX is set. Cannot append system info to JSON file names!"
+fi + +################################################################## + +# Set variables for later use + +# CUDA +# Check if CUDA_HOME has not been set from the outside, useful in CI/CD +if [[ -z "$CUDA_HOME" ]]; then + COMPILER=$(which nvcc 2>/dev/null) + while [ -L "$COMPILER" ]; do + COMPILER=$(readlink "$COMPILER") + done + export COMPILER_PATH=$COMPILER + + if [[ "$COMPILER_PATH" ]]; then + export CUDA_HOME=$(dirname $(dirname $COMPILER_PATH)) + export PATH=$CUDA_HOME${PATH:+:${PATH}} + fi +fi + +# HIP +# Check if HIP_HOME has not been set from the outside, useful in CI/CD +if [[ -z "$HIP_HOME" ]]; then + COMPILER=$(which hipcc 2>/dev/null) + while [ -L "$COMPILER" ]; do + COMPILER=$(readlink "$COMPILER") + done + export COMPILER_PATH=$COMPILER + + if [[ "$COMPILER_PATH" ]]; then + export HIP_HOME=$(dirname $(dirname $COMPILER_PATH)) + export PATH=$HIP_HOME${PATH:+:${PATH}} + fi +fi + +# Prefix for saving the JSON files in workspace folder in the tools/profiling directory +prefix="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" + +export USEBUILDDIR=1 +export NTPBMAX=1024 +export CXX=`which g++` +export FC=`which gfortran` + +export MG_EXE="./gcheck.exe" #GPU +#export MG_EXE="./check.exe" #CPU + +export WORKSPACE=$prefix/workspace_mg4gpu + +REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${CUDA_NAME_PREFIX}_${branch}" + +mkdir $WORKSPACE 2>/dev/null; true +mkdir $REPORT_FOLDER 2>/dev/null; true + +export MG_PROC_DIR=$prefix/../../epochX/cudacpp/$MG_PROC +export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/P1_* +export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards + +# Build executable + +cd $MG_SP_DIR +make -j $makeArgs + +# Run executable + +cd build.${makeArgs:3}* +mkdir -p perf/data/ 2>/dev/null; true +$MG_EXE -j $blocksPerGrid $threadsPerBlock $iterations + +echo "${MG_EXE} -j ${blocksPerGrid} ${threadsPerBlock} ${iterations}" + +cd perf/data/ + +if [ -n "$CUDA_NAME_PREFIX" ]; then + mv 0-perf-test-run0.json "${REPORT_FOLDER}/test_${MG_PROC}_${CUDA_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json" +elif [ -n "$HIP_NAME_PREFIX" ]; then + mv 0-perf-test-run0.json "${REPORT_FOLDER}/test_${MG_PROC}_${HIP_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json" +else + mv 0-perf-test-run0.json "${REPORT_FOLDER}/test_${MG_PROC}_undefined_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json" +fi \ No newline at end of file diff --git a/tools/profiling/buildSYCLProcess.sh b/tools/profiling/buildSYCLProcess.sh new file mode 100755 index 0000000000..efdfd66c71 --- /dev/null +++ b/tools/profiling/buildSYCLProcess.sh @@ -0,0 +1,154 @@ +#!/bin/bash + +# +# __ __ _ ____ _ _ _ ____ ____ _ _ +# | \/ | __ _ __| | / ___| _ __ __ _ _ __ | |__ | || | / ___| | _ \ | | | | +# | |\/| | / _` | / _` | | | _ | '__| / _` | | '_ \ | '_ \ | || |_ | | _ | |_) | | | | | +# | | | | | (_| | | (_| | | |_| | | | | (_| | | |_) | | | | | |__ _| | |_| | | __/ | |_| | +# |_| |_| \__,_| \__,_| \____| |_| \__,_| | .__/ |_| |_| |_| \____| |_| \___/ +# |_| +# +# +# Bash script for compiling and executing physics processes using the MadGraph5_aMC@NLO GPU development framework +# using oneAPI/SYCL +# +# Author: Jorgen Teig, CERN 2023 +# + +helpFunction() +{ + echo "" + echo "Usage: $0 -n gg_ttgg -b 1024 -t 128 -i 10" + echo -e "\t-n Name of the physics process being built and run" + echo -e "\t-b Blocks per grid" + echo -e "\t-t Threads per block" + echo -e "\t-i Iterations" + echo -e "\t-r Branch" + echo -e "\t-d Flag for setting device id" + exit 1 # Exit script after printing help +} + +while
getopts "n:b:t:i:r:d:" opt +do + case "$opt" in + n ) MG_PROC="$OPTARG" ;; #process to target + b ) blocksPerGrid="$OPTARG" ;; + t ) threadsPerBlock="$OPTARG" ;; + i ) iterations="$OPTARG" ;; + r ) branch="$OPTARG" ;; + d ) DEVICE_ID="$OPTARG" ;; + ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent + esac +done + +# Print helpFunction in case parameters are empty +if [ -z "${MG_PROC}" ] || [ -z "${blocksPerGrid}" ] || [ -z "${threadsPerBlock}" ] || [ -z "${iterations}" ] +then + echo "Some or all of the parameters are empty"; + helpFunction +fi + +# Added check if the SYCL_NAME_PREFIX variable is not set +if [ -z "$SYCL_NAME_PREFIX" ]; then + echo "WARNING: SYCL_NAME_PREFIX is not set. Cannot append system info to JSON file names!" +fi + +################################################################## + +# Assign correct SM level for NVIDIA GPUs + +# Check if nvidia-smi command exists +if command -v nvidia-smi > /dev/null 2>&1; then + + # Get the name of the GPU + GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader) + + # GPU (DEVICE_ID=2 for oneAPI toolkit runs on GPUs, else DEVICE_ID=0 with LLVM compiler); keep a value passed in with -d + export DEVICE_ID=${DEVICE_ID:-2} + # CPU + #export DEVICE_ID=1 +else + echo "nvidia-smi not found on system, NVIDIA GPU possibly not present!" + exit 1 +fi + +case $GPU_NAME in + *V100S* ) export SM_LEVEL="sm_70" ;; + *A100* ) export SM_LEVEL="sm_80" ;; +esac + +################################################################## + +# Begin script in case all parameters and GPU specific settings are set + +################################################################## + +# Set variables for later use + +# Assumes that this is run from profiling directory in the repo +prefix=$(pwd) + +export USEBUILDDIR=1 +export NTPBMAX=1024 +export CUDA_PATH=/usr/local/cuda-12.0/ +export WORKSPACE=$prefix/workspace_mg4gpu + +export CXTYPE="thrust" + +# Old SYCLFLAGS +# export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=$SM_LEVEL' -fgpu-rdc --cuda-path=$CUDA_PATH" + +export SYCLFLAGS="-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xcuda-ptxas --maxrregcount=255 -Xcuda-ptxas --verbose -Xsycl-target-backend --cuda-gpu-arch=$SM_LEVEL" + +# Compilation using OneAPI Toolkit through CVMFS +#export CXX=/cvmfs/projects.cern.ch/intelsw/oneAPI/linux/x86_64/2023/compiler/2023.0.0/linux/bin-llvm/clang++ + +# Compilation with LLVM DPC++ compiler +export DPCPP_HOME=/afs/cern.ch/work/j/jteig/sycl_workspace +export CXX=$DPCPP_HOME/llvm/llvm-20230418-fea99cc9ad67-gcc-11.2.1-cuda-12.0/bin/clang++ + +# Sets CUDA in PATH +export PATH=$CUDA_HOME:$PATH + +# Branch should be an environment variable in the main script and then passed down; if none is given, it is not displayed in the prefix +REPORT_FOLDER="${WORKSPACE}/$(date +"%y-%m-%d")_${SYCL_NAME_PREFIX}_${branch}" + +mkdir -p $WORKSPACE/mg4gpu/lib 2>/dev/null; true +mkdir -p $WORKSPACE/mg4gpu/bin 2>/dev/null; true +mkdir $REPORT_FOLDER 2>/dev/null; true + +export MG4GPU_LIB=$WORKSPACE/mg4gpu/lib +export MG4GPU_BIN=$WORKSPACE/mg4gpu/bin + +export MG_PROC_DIR=$prefix/../../epochX/sycl/$MG_PROC +export MG_SP_DIR=$MG_PROC_DIR/SubProcesses/P1_* + +export MG_LIBS_DIR="${MG4GPU_LIB}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" + +if [[ -z "${DPCPP_HOME}" ]]; then + export MG_LIBS="$MG_LIBS_DIR" +else + export MG_LIBS="$DPCPP_HOME/llvm/build/lib:$MG_LIBS_DIR" +fi + +export MG_EXE_DIR="${MG4GPU_BIN}/build_${MG_PROC}_${SYCL_NAME_PREFIX}" +export MG_EXE="$MG_EXE_DIR/check.exe" +export MG5AMC_CARD_PATH=$MG_PROC_DIR/Cards + +# Build
executable +cd $MG_SP_DIR +make -j build.d_inl0_hrd1/check.exe +mv -f ../../lib/build.*/ $MG_LIBS_DIR #2>/dev/null; true +mv -f build.*/ $MG_EXE_DIR + +# Run executable +cd $WORKSPACE + +if [ "$DEVICE_ID" == "info" ]; then + # Add MG Libs to linker library path and display the devices + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE --param_card $MG5AMC_CARD_PATH/param_card.dat --device_info 32 32 10 + +else + # Add MG Libs to linker library path and run the executable + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MG_LIBS $MG_EXE -j --json_file ${REPORT_FOLDER}/test_${MG_PROC}_${SYCL_NAME_PREFIX}_${blocksPerGrid}_${threadsPerBlock}_${iterations}.json --param_card $MG5AMC_CARD_PATH/param_card.dat --device_id $DEVICE_ID $blocksPerGrid $threadsPerBlock $iterations +fi diff --git a/tools/profiling/container-README.md b/tools/profiling/container-README.md new file mode 100644 index 0000000000..782186d0e9 --- /dev/null +++ b/tools/profiling/container-README.md @@ -0,0 +1,7 @@ +podman build --tag github_runner . && \ +podman run --security-opt=label=disable -d=true \ +--env GITHUB_TOKEN=AFPDS6753IL4TZY3PPHNNZLEUWJHA \ +--env REPO_URL=https://github.com/Jooorgen/madgraph4gpu \ +--env GITHUB_RUNNER_TAGS=Linux,x64,a100 \ +--env RUNNER_NAME=GPURunner_itscrd-a100 \ +--name github_runner github_runner \ No newline at end of file diff --git a/tools/profiling/containerSetup.sh b/tools/profiling/containerSetup.sh new file mode 100644 index 0000000000..11dc0774b0 --- /dev/null +++ b/tools/profiling/containerSetup.sh @@ -0,0 +1,98 @@ +# Preliminary setup +podman=${podman:-podman} +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +runnerName=GPURunner_itscrd-a100 +sourceImage=nvidia/cuda:12.0.1-devel-rockylinux8 +tag=githubci-cuda12.0.1-gcc11.3-clang +GitHubRunnerTags=Linux,x64,a100 +githubToken=$1 +runTest=${runTest:-false} # skip the container smoke test below unless runTest=true is exported + +# Links +runnerURL=https://github.com/actions/runner/releases/download/v2.301.1/actions-runner-linux-x64-2.301.1.tar.gz +nvidiaContainerToolkitLink=https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo +repoURL=https://github.com/Jooorgen/madgraph4gpu + +if ! which podman > /dev/null; then + echo "Podman not installed. Trying now ..." + sudo yum install podman + curl -s -L $nvidiaContainerToolkitLink > nvidia-container-runtime.repo + sudo mv nvidia-container-runtime.repo /etc/yum-puppet.repos.d/ + sudo yum install nvidia-container-runtime + + sudo sed -i 's/^#no-cgroups = false/no-cgroups = true/;' /etc/nvidia-container-runtime/config.toml + exit 0 +fi + +if $runTest; then + # Test that container starts up + $podman run --rm --security-opt=label=disable nvidia/cuda:11.5.0-devel-centos8 nvidia-smi || exit 1 +fi + +cat > entrypoint.sh << "EOF" +#!/bin/bash +RUNNER=/home/CI/actions-runner/run.sh + +while true; do + if ! pgrep -f ${RUNNER} > /dev/null 2>&1; then + # Runner hasn't been started yet or exited because of failure / update + ${RUNNER} + else + # Runner was restarted, and is running in background. Let's wait for it.
+ PID=$(pgrep -f ${RUNNER}) && tail --pid=$PID -f /dev/null + fi + sleep 10 +done +EOF + +# In container: +# - install cmake, git, which +cat > containerManifest < DOUBLE_PRECISION_CONSTANT: + + if pyArgs.l.upper() == 'SYCL': + + # There is no .sa in br_golden_epochX4 + # so it makes sure that .sa is included in everything other than that branch + # if pyArgs.b != 'br_golden_epochX4': + #if ".sa" not in process: + # process = process + ".sa" + + bashArgs = ["./buildSYCLProcess.sh", + "-n", process, + "-i", str(ITERATIONS), + "-t", str(TPB), + "-b", str(BPG), + "-r", str(pyArgs.b).lower()] + + elif pyArgs.l.upper() == 'CUDA' or pyArgs.l.upper() == 'HIP': + + bashArgs = ["./buildCUDAProcess.sh", + "-n", process, + "-i", str(ITERATIONS), + "-t", str(TPB), + "-b", str(BPG), + "-r", str(pyArgs.b).lower()] + + else: sys.exit("No abstraction layer matching the supplied string!") + + time = str(datetime.datetime.now().strftime("%H:%M:%S")) + + print(time + " Started " + process + " with TPB("+ str(TPB) +") * BPG("+ str(BPG) +"): " + str(TPB * BPG) + "!") + + build = subprocess.run(bashArgs, check=False) #, stdout=subprocess.DEVNULL) # check=False so a failed build is reported below instead of raising + if build.returncode != 0: + print(time + " " + process + + " FAILED!, threadsPerBlock: " + str(TPB) + + ", blocksPerGrid: " + str(BPG) + + ", Product: " + str(TPB * BPG)) + else: + print(time + " " + process + + " COMPLETED!, threadsPerBlock: " + str(TPB) + + ", blocksPerGrid: " + str(BPG) + + ", Product: " + str(TPB * BPG)) + + count += 1 + +print("Built " + str(count) + " processes!") \ No newline at end of file diff --git a/tools/profiling/profileconfig.ini b/tools/profiling/profileconfig.ini index a233430420..09ccf41895 100755 --- a/tools/profiling/profileconfig.ini +++ b/tools/profiling/profileconfig.ini @@ -3,12 +3,12 @@ make = make #____________________________________________ #REMOVE # whether you want to execute gcheck.exe or ccheck.exe #sys = ccheck.exe -sys = gcheck.exe +#sys = gcheck.exe # check.exe still in development -#sys = check.exe +sys = check.exe #____________________________________________ -epoch = epoch2 -abstr_layer = cuda +epoch = epochx +abstr_layer = sycl process = ee_mumu sigma = P1_Sigma_sm_epem_mupmum #process = gg_ttgg @@ -27,9 +27,10 @@ threads_max = 4 # creates a plot with (NumThreadsPerBlock * BlocksPerGrid) # # on the x-axis # ################################################################## -EvtsPerSec[Rnd+Rmb+ME](123) = on -EvtsPerSec[Rmb+ME] (23) = on +EvtsPerSec[Rnd+Rmb+ME](123) = off +EvtsPerSec[Rmb+ME] (23) = off EvtsPerSec[MatrixElems] (3) = on +EvtsPerSec[MECalcOnly] (3) = on NumMatrixElements(notNan) = off MatrixElemEventsPerSec = off NumIterations = off diff --git a/tools/profiling/sendData.py b/tools/profiling/sendData.py new file mode 100644 index 0000000000..7d7da7d9b7 --- /dev/null +++ b/tools/profiling/sendData.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# +# __ __ _ ____ _ _ _ ____ ____ _ _ +# | \/ | __ _ __| | / ___| _ __ __ _ _ __ | |__ | || | / ___| | _ \ | | | | +# | |\/| | / _` | / _` | | | _ | '__| / _` | | '_ \ | '_ \ | || |_ | | _ | |_) | | | | | +# | | | | | (_| | | (_| | | |_| | | | | (_| | | |_) | | | | | |__ _| | |_| | | __/ | |_| | +# |_| |_| \__,_| \__,_| \____| |_| \__,_| | .__/ |_| |_| |_| \____| |_| \___/ +# |_| +# +# +# Python script for sending generated reports from performance profiling to InfluxDB instance +# using the MadGraph5_aMC@NLO GPU development framework +# +# Author: Jorgen Teig, CERN 2023 +# + +import os +import glob +import json +import re
+import logging
+import subprocess +import datetime +import argparse +import sys + +# Parameter defaults +URL = 'https://dbod-madgraph4gpu-db.cern.ch:8082/api/v2/write?bucket=ProfilerData' +secret = os.environ.get('MADGRAPH4GPU_DB_SECRET') +AUTH = ['db_user', secret] +PHYS_PROCESSES = ['ee_mumu', 'gg_ttggg', 'gg_ttgg', 'gg_ttg', 'gg_tt'] +ABS_LAYERS = ['SYCL', 'CUDA', 'HIP'] +BRANCH = 'master' +FIELDS = ['EvtsPerSec[MatrixElems] (3)', 'EvtsPerSec[MECalcOnly] (3)'] + +# Default reportPath (Useful for testing) +REPORT_PATH = 'C:\\Users\\jteig\\cernbox\\Documents\\test\\22-12-07_cudacpp_Xeon-Silver-4216_v100s_gcc-11.3_cuda-11.6.2_master' + +# Argument parser +parser = argparse.ArgumentParser(description='A script for sending data from profiler to InfluxDB.') + +parser.add_argument('-r', '--reportPath', help="Path for the reports that are being put into the database.", default=REPORT_PATH) +parser.add_argument('-f', '--fields', help="Fields in the JSON to be put into the database.", default=FIELDS) +parser.add_argument('-a', '--absLayer', help="Abstraction layer used when profiling.", default=ABS_LAYERS[0]) +parser.add_argument('-b', '--branch', help="Branch the profiler data is in.", default=BRANCH) +parser.add_argument('-p', '--profiler', help="Enable CI profiling defaults.", default='0') + +args = parser.parse_args() + +# +# Main +# +if __name__=='__main__': + + # Sets report path for extracting the reports generated from performanceProfiler.py + if args.profiler == '1': + + if args.absLayer.upper() == "SYCL": + + syclNamePrefix = os.getenv('SYCL_NAME_PREFIX') + + if syclNamePrefix is None: + logging.error('SYCL name prefix has not been set!') + sys.exit(1) + + reportfolder = "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + syclNamePrefix + '_' + args.branch + + if not os.path.exists(reportfolder): + logging.error('SYCL report path does not exist!') + sys.exit(1) + + elif args.absLayer.upper() == "CUDA": + + cudaNamePrefix = os.getenv('CUDA_NAME_PREFIX') + + if cudaNamePrefix is None: + logging.error('CUDA name prefix has not been set!') + sys.exit(1) + + reportfolder = "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + cudaNamePrefix + '_' + args.branch + + if not os.path.exists(reportfolder): + logging.error('CUDA report path does not exist!') + sys.exit(1) + + elif args.absLayer.upper() == "HIP": + + hipNamePrefix = os.getenv('HIP_NAME_PREFIX') + + if hipNamePrefix is None: + logging.error('HIP name prefix has not been set!') + sys.exit(1) + + reportfolder = "workspace_mg4gpu/" + datetime.datetime.now().strftime('%y-%m-%d') + '_' + hipNamePrefix + '_' + args.branch + + if not os.path.exists(reportfolder): + logging.error('HIP report path does not exist!') + sys.exit(1) + + else: + logging.error('No supported abstraction layer has been selected!') + sys.exit(1) + + else: + reportfolder = args.reportPath + + filePath = [] + filePath.append(glob.glob(reportfolder + '/test_*.json')) + filePath.append(glob.glob(reportfolder + '/*/test_*.json')) + + # Flatten the list + files = [p for sublist in filePath for p in sublist] + + for file in files: + + with open(file, "r", encoding='utf-8') as f: + + fileContents = f.read() + + if fileContents != '': + data = json.loads(fileContents) + + fileName = (os.path.basename(file)) + + for process in PHYS_PROCESSES: + if process in fileName.lower(): + physicsProcess = process + break + + fileNameParts = fileName.split('_') + + CPU = fileNameParts[4] + + GPU = fileNameParts[5] + + GCCVersion =
fileNameParts[6].split('-')[1] + + GPUVersion = fileNameParts[7].split('-')[1] + + gridsize = data[0]["NumThreadsPerBlock"] * data[0]["NumBlocksPerGrid"] + + DBdata = f'{physicsProcess},CPU={CPU},GPU={GPU},AbstractionLayer={args.absLayer},GCCVersion={GCCVersion},GPUVersion={GPUVersion},NumThreadsPerBlock={data[0]["NumThreadsPerBlock"]},NumBlocksPerGrid={data[0]["NumBlocksPerGrid"]},NumIterations={data[0]["NumIterations"]} Gridsize={gridsize}' + + for field in FIELDS: + value = float(re.findall(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?', data[0][field])[0]) # keep any exponent (e.g. '4.8e+05'), not just the mantissa + + DBdata = DBdata + ',' + args.absLayer + "_" + field.replace(" ", "_") + '=' + str(value) + + requestInfo = ["curl", "-i", "-k", '-XPOST', URL, "--header", "Authorization: Token "+AUTH[0]+":"+AUTH[1], "--data-raw", DBdata] + + request = subprocess.run(requestInfo, stdout=subprocess.DEVNULL, check=False) # check=False so a failed request is reported below instead of raising + + if request.returncode != 0: + print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Request FAILED! Data: " + DBdata) + else: + print(str(datetime.datetime.now().strftime("%H:%M:%S")) + " Request COMPLETED! Data: " + DBdata) + + + else: logging.error('No information/fields in the JSON report!')
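+ +# For reference, the payload posted above is InfluxDB line protocol. A hypothetical record for a gg_ttgg run +# (all tag and field values below are made up for illustration; the field keys follow the FIELDS list above) +# would look roughly like: +# gg_ttgg,CPU=my-cpu,GPU=my-gpu,AbstractionLayer=CUDA,GCCVersion=11.3,GPUVersion=12.0.1,NumThreadsPerBlock=128,NumBlocksPerGrid=1024,NumIterations=10 Gridsize=131072,CUDA_EvtsPerSec[MatrixElems]_(3)=483047.5,CUDA_EvtsPerSec[MECalcOnly]_(3)=496349.1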