NOTE(review): this patch was recovered from a copy in which line breaks were
collapsed and angle-bracket arguments were stripped. Indentation of context
lines follows clang-format/LLVM style and should be confirmed against the tree.
The omp.flags / omp.version / omp.requires / omp.declare_target attribute
arguments in the two .mlir tests are presumed values -- TODO confirm. The
atomic_control arguments are derived from each test's CHECK line.
Review changes folded in: extractAtomicControlFlags is now `static` and
documented, and the flag locals at its call sites are initialized.

diff --git a/flang/test/Driver/atomic-control-options.f90 b/flang/test/Driver/atomic-control-options.f90
new file mode 100644
index 0000000000000..cb382f96a9d5f
--- /dev/null
+++ b/flang/test/Driver/atomic-control-options.f90
@@ -0,0 +1,20 @@
+! REQUIRES: amdgpu-registered-target
+! RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-device -munsafe-fp-atomics %s -o -|FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s
+! RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-device -fatomic-ignore-denormal-mode %s -o -|FileCheck -check-prefix=IGNORE-DENORMAL-MODE %s
+! RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-device -fatomic-fine-grained-memory %s -o -|FileCheck -check-prefix=FINE-GRAINED-MEMORY %s
+! RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-device -fatomic-remote-memory %s -o -|FileCheck -check-prefix=REMOTE-MEMORY %s
+program test
+    implicit none
+    integer :: A, threads
+    threads = 128
+    A = 0
+    !$omp target parallel num_threads(threads)
+    !$omp atomic
+    A = A + 1
+    !$omp end target parallel
+end program test
+
+!UNSAFE-FP-ATOMICS: %{{.*}} = atomicrmw add ptr {{.*}}, i32 1 monotonic, align 4, !amdgpu.ignore.denormal.mode !{{.*}}, !amdgpu.no.fine.grained.memory !{{.*}}, !amdgpu.no.remote.memory !{{.*}}
+!IGNORE-DENORMAL-MODE: %{{.*}} = atomicrmw add ptr {{.*}}, i32 1 monotonic, align 4, !amdgpu.ignore.denormal.mode !{{.*}}, !amdgpu.no.fine.grained.memory !{{.*}}, !amdgpu.no.remote.memory !{{.*}}
+!FINE-GRAINED-MEMORY: %{{.*}} = atomicrmw add ptr {{.*}}, i32 1 monotonic, align 4, !amdgpu.no.remote.memory !{{.*}}
+!REMOTE-MEMORY: %{{.*}} = atomicrmw add ptr {{.*}}, i32 1 monotonic, align 4, !amdgpu.no.fine.grained.memory !{{.*}}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 206ad4a4ef85f..b681ea8413726 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -3286,7 +3286,8 @@ class OpenMPIRBuilder {
   emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
                    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
                    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
-                   bool IsXBinopExpr);
+                   bool IsXBinopExpr, bool IsIgnoreDenormalMode,
+                   bool IsFineGrainedMemory, bool IsRemoteMemory);
 
   /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
   ///
@@ -3359,7 +3360,9 @@ class OpenMPIRBuilder {
   LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(
       const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
       Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
-      AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr);
+      AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
+      bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
+      bool IsRemoteMemory = false);
 
   /// Emit atomic update for constructs: --- Only Scalar data types
   /// V = X; X = X BinOp Expr ,
@@ -3394,7 +3397,9 @@ class OpenMPIRBuilder {
       const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
       AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
       AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
-      bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr);
+      bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
+      bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
+      bool IsRemoteMemory = false);
 
   /// Emit atomic compare for constructs: --- Only scalar data types
   /// cond-expr-stmt:
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3aa4f7ae04c33..260d3c292e56b 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -8956,7 +8956,8 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
     const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
     Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
-    AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
+    AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
+    bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
   assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
   if (!updateToLocation(Loc))
     return Loc.IP;
@@ -8974,9 +8975,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
            "OpenMP atomic does not support LT or GT operations");
   });
 
-  Expected<std::pair<Value *, Value *>> AtomicResult =
-      emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
-                       X.IsVolatile, IsXBinopExpr);
+  Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
+      AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
+      IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
   if (!AtomicResult)
     return AtomicResult.takeError();
   checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
@@ -9023,7 +9024,8 @@ Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
 Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
     InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
     AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
-    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
+    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
+    bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
   // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
   // or a complex datatype.
   bool emitRMWOp = false;
@@ -9046,7 +9048,20 @@ Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
 
   std::pair<Value *, Value *> Res;
   if (emitRMWOp) {
-    Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
+    AtomicRMWInst *RMWInst =
+        Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
+    if (T.isAMDGPU()) {
+      if (IsIgnoreDenormalMode)
+        RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
+                             llvm::MDNode::get(Builder.getContext(), {}));
+      if (!IsFineGrainedMemory)
+        RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
+                             llvm::MDNode::get(Builder.getContext(), {}));
+      if (!IsRemoteMemory)
+        RMWInst->setMetadata("amdgpu.no.remote.memory",
+                             llvm::MDNode::get(Builder.getContext(), {}));
+    }
+    Res.first = RMWInst;
     // not needed except in case of postfix captures. Generate anyway for
     // consistency with the else part. Will be removed with any DCE pass.
     // AtomicRMWInst::Xchg does not have a coressponding instruction.
@@ -9178,7 +9193,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
     const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
     AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
     AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
-    bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
+    bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
+    bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
   if (!updateToLocation(Loc))
     return Loc.IP;
 
@@ -9197,9 +9213,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
   // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
   // 'x' is simply atomically rewritten with 'expr'.
   AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
-  Expected<std::pair<Value *, Value *>> AtomicResult =
-      emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
-                       X.IsVolatile, IsXBinopExpr);
+  Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
+      AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
+      IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
   if (!AtomicResult)
     return AtomicResult.takeError();
   Value *CapturedVal =
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 762cc88d9fc3d..2cdd502ad0275 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3205,6 +3205,26 @@ llvm::AtomicRMWInst::BinOp convertBinOpToAtomic(Operation &op) {
       .Default(llvm::AtomicRMWInst::BinOp::BAD_BINOP);
 }
 
+/// Reads the `atomic_control` attribute of \p atomicUpdateOp into the three
+/// out-parameters. Every flag is reset to false first and stays false when the
+/// op is null or does not carry the attribute.
+static void extractAtomicControlFlags(omp::AtomicUpdateOp atomicUpdateOp,
+                                      bool &isIgnoreDenormalMode,
+                                      bool &isFineGrainedMemory,
+                                      bool &isRemoteMemory) {
+  isIgnoreDenormalMode = false;
+  isFineGrainedMemory = false;
+  isRemoteMemory = false;
+  if (atomicUpdateOp &&
+      atomicUpdateOp->hasAttr(atomicUpdateOp.getAtomicControlAttrName())) {
+    mlir::omp::AtomicControlAttr atomicControlAttr =
+        atomicUpdateOp.getAtomicControlAttr();
+    isIgnoreDenormalMode = atomicControlAttr.getIgnoreDenormalMode();
+    isFineGrainedMemory = atomicControlAttr.getFineGrainedMemory();
+    isRemoteMemory = atomicControlAttr.getRemoteMemory();
+  }
+}
+
 /// Converts an OpenMP atomic update operation using OpenMPIRBuilder.
 static LogicalResult
 convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst,
@@ -3269,13 +3289,19 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst,
     return moduleTranslation.lookupValue(yieldop.getResults()[0]);
   };
 
+  bool isIgnoreDenormalMode = false;
+  bool isFineGrainedMemory = false;
+  bool isRemoteMemory = false;
+  extractAtomicControlFlags(opInst, isIgnoreDenormalMode, isFineGrainedMemory,
+                            isRemoteMemory);
   // Handle ambiguous alloca, if any.
   auto allocaIP = findAllocaInsertPoint(builder, moduleTranslation);
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
       ompBuilder->createAtomicUpdate(ompLoc, allocaIP, llvmAtomicX, llvmExpr,
                                      atomicOrdering, binop, updateFn,
-                                     isXBinopExpr);
+                                     isXBinopExpr, isIgnoreDenormalMode,
+                                     isFineGrainedMemory, isRemoteMemory);
 
   if (failed(handleError(afterIP, *opInst)))
     return failure();
@@ -3364,13 +3390,19 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp,
     return moduleTranslation.lookupValue(yieldop.getResults()[0]);
   };
 
+  bool isIgnoreDenormalMode = false;
+  bool isFineGrainedMemory = false;
+  bool isRemoteMemory = false;
+  extractAtomicControlFlags(atomicUpdateOp, isIgnoreDenormalMode,
+                            isFineGrainedMemory, isRemoteMemory);
   // Handle ambiguous alloca, if any.
   auto allocaIP = findAllocaInsertPoint(builder, moduleTranslation);
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
       ompBuilder->createAtomicCapture(
           ompLoc, allocaIP, llvmAtomicX, llvmAtomicV, llvmExpr, atomicOrdering,
-          binop, updateFn, atomicUpdateOp, isPostfixUpdate, isXBinopExpr);
+          binop, updateFn, atomicUpdateOp, isPostfixUpdate, isXBinopExpr,
+          isIgnoreDenormalMode, isFineGrainedMemory, isRemoteMemory);
 
   if (failed(handleError(afterIP, *atomicCaptureOp)))
     return failure();
diff --git a/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir b/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir
new file mode 100644
index 0000000000000..355390719322f
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-atomic-capture-control-options.mlir
@@ -0,0 +1,44 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// CHECK: atomicrmw add ptr %loadgep_, i32 1 monotonic, align 4, !amdgpu.no.remote.memory !{{.*}}
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> : vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array<i32: 32, 64>, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, fir.atomic_fine_grained_memory, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "generic-hsa", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<openmp_device_version = 31>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target_triples = [], omp.version = #omp.version<version = 31>} {
+  llvm.func @_QQmain() attributes {fir.bindc_name = "TEST", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "generic-hsa"} {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "threads"} : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %3 x i32 {bindc_name = "capture"} : (i64) -> !llvm.ptr<5>
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+    %6 = llvm.mlir.constant(1 : i64) : i64
+    %7 = llvm.alloca %6 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr<5>
+    %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr
+    %9 = llvm.mlir.constant(0 : i32) : i32
+    %10 = llvm.mlir.constant(128 : i32) : i32
+    %11 = llvm.mlir.constant(1 : i64) : i64
+    %12 = llvm.mlir.constant(1 : i64) : i64
+    %13 = llvm.mlir.constant(1 : i64) : i64
+    llvm.store %10, %2 : i32, !llvm.ptr
+    llvm.store %9, %8 : i32, !llvm.ptr
+    %14 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "threads"}
+    %15 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "capture"}
+    %16 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "a"}
+    omp.target map_entries(%14 -> %arg0, %15 -> %arg1, %16 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+      %17 = llvm.mlir.constant(1 : i32) : i32
+      %18 = llvm.load %arg0 : !llvm.ptr -> i32
+      omp.parallel num_threads(%18 : i32) {
+        omp.atomic.capture {
+          omp.atomic.read %arg1 = %arg2 : !llvm.ptr, !llvm.ptr, i32
+          omp.atomic.update %arg2 : !llvm.ptr {
+          ^bb0(%arg3: i32):
+            %19 = llvm.add %arg3, %17 : i32
+            omp.yield(%19 : i32)
+          } {atomic_control = #omp.atomic_control<fine_grained_memory = true>}
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
diff --git a/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir b/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir
new file mode 100644
index 0000000000000..3b0005bd20798
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-atomic-update-control-options.mlir
@@ -0,0 +1,36 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// CHECK: atomicrmw add ptr %loadgep_, i32 1 monotonic, align 4, !amdgpu.ignore.denormal.mode !{{.*}}, !amdgpu.no.fine.grained.memory !{{.*}}, !amdgpu.no.remote.memory !{{.*}}
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr = dense<64> : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> : vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array<i32: 32, 64>, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, fir.atomic_ignore_denormal_mode, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "generic-hsa", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<openmp_device_version = 31>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target_triples = [], omp.version = #omp.version<version = 31>} {
+  llvm.func @_QQmain() attributes {fir.bindc_name = "TEST", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "generic-hsa"} {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "threads"} : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %3 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr<5>
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+    %6 = llvm.mlir.constant(0 : i32) : i32
+    %7 = llvm.mlir.constant(128 : i32) : i32
+    %8 = llvm.mlir.constant(1 : i64) : i64
+    %9 = llvm.mlir.constant(1 : i64) : i64
+    llvm.store %7, %2 : i32, !llvm.ptr
+    llvm.store %6, %5 : i32, !llvm.ptr
+    %10 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "threads"}
+    %11 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "a"}
+    omp.target map_entries(%10 -> %arg0, %11 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+      %12 = llvm.mlir.constant(1 : i32) : i32
+      %13 = llvm.load %arg0 : !llvm.ptr -> i32
+      omp.parallel num_threads(%13 : i32) {
+        omp.atomic.update %arg1 : !llvm.ptr {
+        ^bb0(%arg2: i32):
+          %14 = llvm.add %arg2, %12 : i32
+          omp.yield(%14 : i32)
+        } {atomic_control = #omp.atomic_control<ignore_denormal_mode = true>}
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}