-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[libc++] Vectorize std::find #156431
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc++] Vectorize std::find #156431
Conversation
d4881e1
to
8c5a5dc
Compare
@llvm/pr-subscribers-libcxx Author: Nikolas Klauser (philnik777) Changes
Full diff: https://github.com/llvm/llvm-project/pull/156431.diff 4 Files Affected:
diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst
index cd8171dc5d1e8..f36230697a25c 100644
--- a/libcxx/docs/ReleaseNotes/22.rst
+++ b/libcxx/docs/ReleaseNotes/22.rst
@@ -62,6 +62,8 @@ Improvements and New Features
- ``ofstream::write`` has been optimized to pass through large strings to system calls directly instead of copying them
in chunks into a buffer.
+- The performance of ``std::find`` has been improved by up to 2x for integral types
+
Deprecations and Removals
-------------------------
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 8c8cb5820fee3..b76216c0a2d7b 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -12,6 +12,7 @@
#include <__algorithm/find_segment_if.h>
#include <__algorithm/min.h>
+#include <__algorithm/simd_utils.h>
#include <__algorithm/unwrap_iter.h>
#include <__bit/countr.h>
#include <__bit/invert_if.h>
@@ -44,39 +45,98 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// generic implementation
template <class _Iter, class _Sent, class _Tp, class _Proj>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
-__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
for (; __first != __last; ++__first)
if (std::__invoke(__proj, *__first) == __value)
break;
return __first;
}
-// trivially equality comparable implementations
-template <class _Tp,
- class _Up,
- class _Proj,
- __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
- sizeof(_Tp) == 1,
- int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
- if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
- return __ret;
- return __last;
+template <class _Iter, class _Sent, class _Tp, class _Proj>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
+__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+ return std::__find_loop(std::move(__first), std::move(__last), __value, __proj);
}
-#if _LIBCPP_HAS_WIDE_CHARACTERS
-template <class _Tp,
- class _Up,
- class _Proj,
- __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
- sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t),
- int> = 0>
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+template <class _Tp, class _Up>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find_vectorized(_Tp* __first, _Tp* __last, _Up __value) {
+ constexpr size_t __unroll_count = 4;
+ constexpr size_t __vec_size = __native_vector_size<_Tp>;
+ using __vec = __simd_vector<_Tp, __vec_size>;
+
+ auto __orig_first = __first;
+
+ auto __values = static_cast<__simd_vector<_Up, __vec_size>>(__value);
+ while (static_cast<size_t>(__last - __first) >= __unroll_count * __vec_size) [[__unlikely__]] {
+ __vec __lhs[__unroll_count];
+
+ for (size_t __i = 0; __i != __unroll_count; ++__i)
+ __lhs[__i] = std::__load_vector<__vec>(__first + __i * __vec_size);
+
+ for (size_t __i = 0; __i != __unroll_count; ++__i) {
+ if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
+ auto __offset = __i * __vec_size + std::__find_first_set(__cmp_res);
+ return __first + __offset;
+ }
+ }
+
+ __first += __unroll_count * __vec_size;
+ }
+
+ // check the remaining 0-3 vectors
+ while (static_cast<size_t>(__last - __first) >= __vec_size) {
+ if (auto __cmp_res = std::__load_vector<__vec>(__first) == __values; std::__any_of(__cmp_res)) {
+ return __first + std::__find_first_set(__cmp_res);
+ }
+ __first += __vec_size;
+ }
+
+ if (__last - __first == 0)
+ return __first;
+
+ // Check if we can load elements in front of the current pointer. If that's the case load a vector at
+ // (last - vector_size) to check the remaining elements
+ if (static_cast<size_t>(__first - __orig_first) >= __vec_size) {
+ __first = __last - __vec_size;
+ return __first + std::__find_first_set(std::__load_vector<__vec>(__first) == __values);
+ }
+
+ __identity __proj;
+ return std::__find_loop(__first, __last, __value, __proj);
+}
+#endif
+
+// trivially equality comparable implementations
+template <
+ class _Tp,
+ class _Up,
+ class _Proj,
+ __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
- if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
- return __ret;
- return __last;
+ if constexpr (sizeof(_Tp) == 1) {
+ if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
+ return __ret;
+ return __last;
+ }
+#if _LIBCPP_HAS_WIDE_CHARACTERS
+ else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) {
+ if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
+ return __ret;
+ return __last;
+ }
+#endif
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+ else if constexpr (is_integral<_Tp>::value) {
+ return std::__find_vectorized(__first, __last, __value);
+ }
+#endif
+ else {
+ __identity __proj;
+ return std::__find_loop(__first, __last, __value, __proj);
+ }
}
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
// TODO: This should also be possible to get right with different signedness
// cast integral types to allow vectorization
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 07fef20f6166d..eb5da7f46ecd4 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -108,19 +108,26 @@ using __simd_vector_underlying_type_t _LIBCPP_NODEBUG = decltype(std::__simd_vec
// This isn't inlined without always_inline when loading chars.
template <class _VecT, class _Iter>
-[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(_Iter __iter) noexcept {
+[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _VecT
+__load_vector(_Iter __iter) noexcept {
return [=]<size_t... _Indices>(index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept {
return _VecT{__iter[_Indices]...};
}(make_index_sequence<__simd_vector_size_v<_VecT>>{});
}
+template <class _Tp, size_t _Np>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __any_of(__simd_vector<_Tp, _Np> __vec) noexcept {
+ return __builtin_reduce_or(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
+}
+
template <class _Tp, size_t _Np>
[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
}
template <class _Tp, size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_t
+__find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
using __mask_vec = __simd_vector<bool, _Np>;
// This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
index b2ead1cc75585..afea31fb59e95 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
@@ -51,6 +51,7 @@ int main(int argc, char** argv) {
// find
bm.template operator()<std::vector<char>>("std::find(vector<char>) (" + comment + ")", std_find);
bm.template operator()<std::vector<int>>("std::find(vector<int>) (" + comment + ")", std_find);
+ bm.template operator()<std::vector<long long>>("std::find(vector<long long>) (" + comment + ")", std_find);
bm.template operator()<std::deque<int>>("std::find(deque<int>) (" + comment + ")", std_find);
bm.template operator()<std::list<int>>("std::find(list<int>) (" + comment + ")", std_find);
|
@@ -44,39 +45,98 @@ _LIBCPP_BEGIN_NAMESPACE_STD | |||
// generic implementation | |||
template <class _Iter, class _Sent, class _Tp, class _Proj> | |||
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter | |||
__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { | |||
__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In an ideal world, the code in __find_loop
as-is would be using __builtin_assume_dereferenceable
and getting vectorized: https://godbolt.org/z/M9xo6PdY1. IIUC this doesn't work in @philnik777 's experience because:
- The function that we want to vectorize can't contain pointer arithmetic, which is always going to happen because
vector.begin()
andvector.end()
perform such arithmetic. - The code is inside a function template, and apparently the optimization doesn't work inside templates?
I'm just summarizing the issues we encountered while trying to make it work with the builtin. CC @fhahn , let's figure out whether we've done something incorrectly here or whether something needs to change in the optimization.
8c5a5dc
to
4b65ea1
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Let's land this and then switch over to using the Clang builtin when it's ready enough.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you update the table in the description to include what platform/HW the benchmarks were run on?
Looking at it (and I might have missed something, would be easier to see if the table would report the % change), we only get speedups for long long
, but not int
?
Correct. We forward to |
4b65ea1
to
6b2ae3a
Compare
You can test this locally with the following command:git-clang-format --diff origin/main HEAD --extensions cpp,h -- libcxx/include/__algorithm/find.h libcxx/include/__algorithm/simd_utils.h libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
View the diff from clang-format here.diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 28ac2db14..5f32ae8fc 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -123,18 +123,18 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _T
return __ret;
return __last;
}
-#if _LIBCPP_HAS_WIDE_CHARACTERS
+# if _LIBCPP_HAS_WIDE_CHARACTERS
else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) {
if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
return __ret;
return __last;
}
-#endif
-#if _LIBCPP_VECTORIZE_ALGORITHMS
+# endif
+# if _LIBCPP_VECTORIZE_ALGORITHMS
else if constexpr (is_integral<_Tp>::value) {
return std::__find_vectorized(__first, __last, __value);
}
-#endif
+# endif
else {
__identity __proj;
return std::__find_loop(__first, __last, __value, __proj);
|
6b2ae3a
to
854b757
Compare
854b757
to
8bc6c0d
Compare
This breaks compilation with To reproduce: $ clang -target x86_64-w64-mingw32 -flax-vector-conversions=none -include memory -c -x c++ - < /dev/null
In file included from <built-in>:1:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/memory:998:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/iterator:693:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__iterator/istreambuf_iterator.h:19:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__string/char_traits.h:13:
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__algorithm/find.h:80:41: error: cannot convert between vector values of different size ('__vec' (aka '__simd_vector<const char32_t, __vec_size>') and '__simd_vector<char32_t, __vec_size>' (vector of 4 'char32_t' values))
80 | if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
| ~~~~~~~~~~ ^ ~~~~~~~~
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__algorithm/find.h:135:17: note: in instantiation of function template specialization 'std::__find_vectorized<const char32_t, char32_t>' requested here
135 | return std::__find_vectorized(__first, __last, __value);
| ^
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__string/char_traits.h:320:37: note: in instantiation of function template specialization 'std::__find<const char32_t, char32_t, std::__identity, 0>' requested here
320 | const char_type* __match = std::__find(__s, __s + __n, __a, __proj);
| ^ |
That looks a lot like a Clang bug to me. The vector elements have the same size AFAICT. Could you file a bug for that? I'll try to think of a way to work around the problem in the meantime. |
Sorry, this is way too nested for me to extract a reasonable bug report for Clang out of it. I can provide a preprocessed input file triggering the case though, but I presume you can do that yourself as well, with your build of Clang and the recent libc++ headers. |
Or if you mean file a bug about this "recent libc++ fails with Clang with |
``` Apple M4: ----------------------------------------------------------------------------- Benchmark old new ----------------------------------------------------------------------------- std::find(vector<char>) (bail 25%)/8 1.43 ns 1.44 ns std::find(vector<char>) (bail 25%)/1024 5.54 ns 5.59 ns std::find(vector<char>) (bail 25%)/8192 38.4 ns 39.1 ns std::find(vector<char>) (bail 25%)/32768 134 ns 136 ns std::find(vector<int>) (bail 25%)/8 1.56 ns 1.57 ns std::find(vector<int>) (bail 25%)/1024 65.3 ns 65.4 ns std::find(vector<int>) (bail 25%)/8192 465 ns 464 ns std::find(vector<int>) (bail 25%)/32768 1832 ns 1832 ns std::find(vector<long long>) (bail 25%)/8 0.920 ns 1.20 ns std::find(vector<long long>) (bail 25%)/1024 65.2 ns 31.2 ns std::find(vector<long long>) (bail 25%)/8192 464 ns 255 ns std::find(vector<long long>) (bail 25%)/32768 1833 ns 992 ns std::find(vector<char>) (process all)/8 1.21 ns 1.22 ns std::find(vector<char>) (process all)/50 1.92 ns 1.93 ns std::find(vector<char>) (process all)/1024 16.6 ns 16.9 ns std::find(vector<char>) (process all)/8192 134 ns 136 ns std::find(vector<char>) (process all)/32768 488 ns 503 ns std::find(vector<int>) (process all)/8 2.45 ns 2.48 ns std::find(vector<int>) (process all)/50 12.7 ns 12.7 ns std::find(vector<int>) (process all)/1024 236 ns 236 ns std::find(vector<int>) (process all)/8192 1830 ns 1834 ns std::find(vector<int>) (process all)/32768 7351 ns 7346 ns std::find(vector<long long>) (process all)/8 2.02 ns 1.45 ns std::find(vector<long long>) (process all)/50 12.0 ns 6.12 ns std::find(vector<long long>) (process all)/1024 235 ns 123 ns std::find(vector<long long>) (process all)/8192 1830 ns 983 ns std::find(vector<long long>) (process all)/32768 7306 ns 3969 ns std::find(vector<bool>) (process all)/8 1.14 ns 1.15 ns std::find(vector<bool>) (process all)/50 1.16 ns 1.17 ns std::find(vector<bool>) (process all)/1024 4.51 ns 4.53 ns std::find(vector<bool>) (process all)/8192 33.6 ns 33.5 ns std::find(vector<bool>) (process all)/1048576 3660 ns 3660 ns ```
Uh oh!
There was an error while loading. Please reload this page.