-
Notifications
You must be signed in to change notification settings - Fork 14.7k
Description
I found that 535.weather_t in SPEChpc 2021 experiences an 8.9% slowdown on NVIDIA Grace after #125732. This is because one of the loops is no longer vectorized. If OpenMP is disabled, the loop is still vectorized because `memcpy` is not called within the loop. (In addition, with OpenMP enabled, passing the `-mllvm -disable-loop-idiom-memcpy` flag also restores vectorization.)
reproducer: https://godbolt.org/z/5TYohj1d6
Before the patch, the array assignment to `stencil` was first replaced with a `memcpy` in LoopIdiomRecognize, and then both `stencil` and the `memcpy` were removed in SROA. `stencil` can be removed here because it is just an alias of `in3d(i-2:i+1, k)`. After the patch, this SROA optimization no longer applies, and the `memcpy` remains in the loop. It seems that the private copy of `stencil` is now allocated on the heap rather than on the stack, which prevents the optimization.
The following is the LLVM IR before SROA.
- Before the patch
define internal void @_QMreproPsub..omp_par(ptr noalias readnone captures(none) %tid.addr, ptr noalias readnone captures(none) %zero.addr, ptr readonly captures(none) %0) #0 {
omp.par.entry:
%loadgep_.reloaded = load ptr, ptr %0, align 8
%gep_.reloaded12 = getelementptr i8, ptr %0, i64 8
%loadgep_.reloaded12 = load ptr, ptr %gep_.reloaded12, align 8
%gep_ = getelementptr i8, ptr %0, i64 16
%loadgep_ = load ptr, ptr %gep_, align 8
%gep_1 = getelementptr i8, ptr %0, i64 24
%loadgep_2 = load ptr, ptr %gep_1, align 8
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i32, align 4
%p.upperbound = alloca i32, align 4
%p.stride = alloca i32, align 4
%1 = load i64, ptr %loadgep_.reloaded, align 8
%2 = load i64, ptr %loadgep_.reloaded12, align 8
%3 = alloca [4 x double], align 8
%4 = alloca [4 x double], align 8 ;; stencil(private)
...
omp.wsloop.region6.preheader: ; preds = %omp_loop.body
%scevgep = getelementptr i8, ptr %loadgep_2, i64 %25
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %4, ptr align 8 %scevgep, i64 32, i1 false), !tbaa !9 ;; stencil(:) = in3d(i-2:i+1, k)
br label %omp.wsloop.region8
- After the patch
define internal void @_QMreproPsub..omp_par(ptr noalias readnone captures(none) %tid.addr, ptr noalias readnone captures(none) %zero.addr, ptr readonly captures(none) %0) #2 {
omp.par.entry:
%loadgep_.reloaded = load ptr, ptr %0, align 8
%gep_.reloaded23 = getelementptr i8, ptr %0, i64 8
%loadgep_.reloaded23 = load ptr, ptr %gep_.reloaded23, align 8
%gep_3 = getelementptr i8, ptr %0, i64 32
%loadgep_4 = load ptr, ptr %gep_3, align 8
%gep_5 = getelementptr i8, ptr %0, i64 40
%loadgep_6 = load ptr, ptr %gep_5, align 8
%p.lastiter = alloca i32, align 4
%p.lowerbound = alloca i32, align 4
%p.upperbound = alloca i32, align 4
%p.stride = alloca i32, align 4
%1 = load i64, ptr %loadgep_.reloaded, align 8
%2 = load i64, ptr %loadgep_.reloaded23, align 8
%3 = load i32, ptr @_QMreproEnx, align 4, !tbaa !3
%4 = tail call dereferenceable_or_null(32) ptr @malloc(i64 32) ;; stencil(private)
...
omp.wsloop.region10.preheader: ; preds = %omp_loop.body
%scevgep = getelementptr i8, ptr %loadgep_6, i64 %25
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %4, ptr align 8 %scevgep, i64 32, i1 false), !tbaa !9 ;; stencil(:) = in3d(i-2:i+1, k)
br label %omp.wsloop.region12
After SROA runs, we get the following IR.
- Before the patch
;; only loads in3d(i-2:i+1, k)
omp.wsloop.region6.preheader: ; preds = %omp_loop.body
%scevgep = getelementptr i8, ptr %loadgep_2, i64 %17
%.sroa.0.0.copyload = load double, ptr %scevgep, align 8, !tbaa !9
%.sroa.8.0.scevgep.sroa_idx = getelementptr inbounds i8, ptr %scevgep, i64 8
%.sroa.8.0.copyload = load double, ptr %.sroa.8.0.scevgep.sroa_idx, align 8, !tbaa !9
%.sroa.12.0.scevgep.sroa_idx = getelementptr inbounds i8, ptr %scevgep, i64 16
%.sroa.12.0.copyload = load double, ptr %.sroa.12.0.scevgep.sroa_idx, align 8, !tbaa !9
%.sroa.16.0.scevgep.sroa_idx = getelementptr inbounds i8, ptr %scevgep, i64 24
%.sroa.16.0.copyload = load double, ptr %.sroa.16.0.scevgep.sroa_idx, align 8, !tbaa !9
br label %omp.wsloop.region8
- After the patch
;; no difference from the pre-SROA IR: the memcpy remains
omp.wsloop.region10.preheader: ; preds = %omp_loop.body
%scevgep = getelementptr i8, ptr %loadgep_6, i64 %25
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %4, ptr align 8 %scevgep, i64 32, i1 false), !tbaa !9 ;; stencil(:) = in3d(i-2:i+1, k)
br label %omp.wsloop.region12