@@ -625,49 +625,41 @@ isbadzero(op, x) = false
# Zero-sign helpers for the max/min reductions: a value counts as the "good"
# zero for one operator exactly when `isbadzero` reports it as the "bad" zero
# for the opposite operator.
function isgoodzero(::typeof(max), x)
    return isbadzero(min, x)
end

function isgoodzero(::typeof(min), x)
    return isbadzero(max, x)
end
627627
628- function mapreduce_impl (f, op:: Union{typeof(max), typeof(min)} ,
629- A:: AbstractArrayOrBroadcasted , first :: Int , last :: Int )
630- # 1. This optimization gives different result from general fallback, if the inputs `f.(A)`
631- # contains both 'missing' and 'Nan'.
632- # 2. For Integer cases , general fallback seems faster.
633- # Based the above reasons, only use this for AbstractFloat cases .
634- Eltype = _return_type (i -> f (A[i]) , Tuple{Int})
628+ function mapreduce_impl (f, op:: Union{typeof(max),typeof(min)} ,
629+ A:: AbstractArrayOrBroadcasted , fi :: Int , la :: Int )
630+ @inline elf (i) = @inbounds f (A[i])
631+ # 1. If `f.(A)` contains both `missing` and `NaN`, this might return `NaN`.
632+ # 2. For Integer input, the general fallback is about 2x faster.
633+ # Thus, limit this optimization to AbstractFloat.
634+ Eltype = _return_type (elf , Tuple{Int})
635635 Eltype <: AbstractFloat ||
636- return invoke (mapreduce_impl,Tuple{Any,Any,AbstractArrayOrBroadcasted,Int,Int},f,op,A,first,last)
637- a1 = @inbounds A[first]
638- v1 = mapreduce_first (f, op, a1)
639- v2 = v3 = v4 = v1
640- chunk_len = 256
641- start = first + 1
642- simdstop = start + chunk_len - 4
643- while simdstop <= last - 3
644- # short circuit in case of NaN or missing
645- v1 == v1 || return v1
646- v2 == v2 || return v2
647- v3 == v3 || return v3
648- v4 == v4 || return v4
649- @inbounds for i in start: 4 : simdstop
650- v1 = _fast (op, v1, f (A[i+ 0 ]))
651- v2 = _fast (op, v2, f (A[i+ 1 ]))
652- v3 = _fast (op, v3, f (A[i+ 2 ]))
653- v4 = _fast (op, v4, f (A[i+ 3 ]))
636+ return invoke (mapreduce_impl,Tuple{Any,Any,AbstractArrayOrBroadcasted,Int,Int},f,op,A,fi,la)
637+ v1 = v2 = v3 = v4 = elf (fi)
638+ len = (la - fi) >> 2
639+ i = fi
640+ for I in Iterators. partition (1 : len, 64 )
641+ for _ in I
642+ v1 = _fast (op, v1, elf (i+= 1 ))
643+ v2 = _fast (op, v2, elf (i+= 1 ))
644+ v3 = _fast (op, v3, elf (i+= 1 ))
645+ v4 = _fast (op, v4, elf (i+= 1 ))
654646 end
655- checkbounds (A, simdstop+ 3 )
656- start += chunk_len
657- simdstop += chunk_len
647+ # short circuit in case of NaN
648+ isnan (v1) && return v1
649+ isnan (v2) && return v2
650+ isnan (v3) && return v3
651+ isnan (v4) && return v4
658652 end
659653 v = op (op (v1,v2),op (v3,v4))
660- for i in start: last
661- @inbounds ai = A[i]
662- v = op (v, f (ai))
654+ for i in i+ 1 : la
655+ v = op (v, elf (i))
663656 end
664-
665657 # enforce correct order of 0.0 and -0.0
666658 # e.g. maximum([0.0, -0.0]) === 0.0
667659 # should hold
668660 if isbadzero (op, v)
669- for i in first : last
670- x = @inbounds A[i]
661+ for i in fi : la
662+ x = elf (i)
671663 isgoodzero (op,x) && return x
672664 end
673665 end
0 commit comments