Skip to content

Conversation

philnik777
Copy link
Contributor

@philnik777 philnik777 commented Sep 2, 2025

Apple M4:
-----------------------------------------------------------------------------
Benchmark                                                 old             new
-----------------------------------------------------------------------------
std::find(vector<char>) (bail 25%)/8                  1.43 ns         1.44 ns
std::find(vector<char>) (bail 25%)/1024               5.54 ns         5.59 ns
std::find(vector<char>) (bail 25%)/8192               38.4 ns         39.1 ns
std::find(vector<char>) (bail 25%)/32768               134 ns          136 ns
std::find(vector<int>) (bail 25%)/8                   1.56 ns         1.57 ns
std::find(vector<int>) (bail 25%)/1024                65.3 ns         65.4 ns
std::find(vector<int>) (bail 25%)/8192                 465 ns          464 ns
std::find(vector<int>) (bail 25%)/32768               1832 ns         1832 ns
std::find(vector<long long>) (bail 25%)/8            0.920 ns         1.20 ns
std::find(vector<long long>) (bail 25%)/1024          65.2 ns         31.2 ns
std::find(vector<long long>) (bail 25%)/8192           464 ns          255 ns
std::find(vector<long long>) (bail 25%)/32768         1833 ns          992 ns
std::find(vector<char>) (process all)/8               1.21 ns         1.22 ns
std::find(vector<char>) (process all)/50              1.92 ns         1.93 ns
std::find(vector<char>) (process all)/1024            16.6 ns         16.9 ns
std::find(vector<char>) (process all)/8192             134 ns          136 ns
std::find(vector<char>) (process all)/32768            488 ns          503 ns
std::find(vector<int>) (process all)/8                2.45 ns         2.48 ns
std::find(vector<int>) (process all)/50               12.7 ns         12.7 ns
std::find(vector<int>) (process all)/1024              236 ns          236 ns
std::find(vector<int>) (process all)/8192             1830 ns         1834 ns
std::find(vector<int>) (process all)/32768            7351 ns         7346 ns
std::find(vector<long long>) (process all)/8          2.02 ns         1.45 ns
std::find(vector<long long>) (process all)/50         12.0 ns         6.12 ns
std::find(vector<long long>) (process all)/1024        235 ns          123 ns
std::find(vector<long long>) (process all)/8192       1830 ns          983 ns
std::find(vector<long long>) (process all)/32768      7306 ns         3969 ns
std::find(vector<bool>) (process all)/8               1.14 ns         1.15 ns
std::find(vector<bool>) (process all)/50              1.16 ns         1.17 ns
std::find(vector<bool>) (process all)/1024            4.51 ns         4.53 ns
std::find(vector<bool>) (process all)/8192            33.6 ns         33.5 ns
std::find(vector<bool>) (process all)/1048576         3660 ns         3660 ns

@ldionne ldionne marked this pull request as ready for review September 15, 2025 15:30
@ldionne ldionne requested a review from a team as a code owner September 15, 2025 15:30
@llvmbot llvmbot added the libc++ libc++ C++ Standard Library. Not GNU libstdc++. Not libc++abi. label Sep 15, 2025
@llvmbot
Copy link
Member

llvmbot commented Sep 15, 2025

@llvm/pr-subscribers-libcxx

Author: Nikolas Klauser (philnik777)

Changes
-----------------------------------------------------------------------------
Benchmark                                                 old             new
-----------------------------------------------------------------------------
std::find(vector&lt;char&gt;) (bail 25%)/8                  1.43 ns         1.44 ns
std::find(vector&lt;char&gt;) (bail 25%)/1024               5.54 ns         5.59 ns
std::find(vector&lt;char&gt;) (bail 25%)/8192               38.4 ns         39.1 ns
std::find(vector&lt;char&gt;) (bail 25%)/32768               134 ns          136 ns
std::find(vector&lt;int&gt;) (bail 25%)/8                   1.56 ns         1.57 ns
std::find(vector&lt;int&gt;) (bail 25%)/1024                65.3 ns         65.4 ns
std::find(vector&lt;int&gt;) (bail 25%)/8192                 465 ns          464 ns
std::find(vector&lt;int&gt;) (bail 25%)/32768               1832 ns         1832 ns
std::find(vector&lt;long long&gt;) (bail 25%)/8            0.920 ns         1.20 ns
std::find(vector&lt;long long&gt;) (bail 25%)/1024          65.2 ns         31.2 ns
std::find(vector&lt;long long&gt;) (bail 25%)/8192           464 ns          255 ns
std::find(vector&lt;long long&gt;) (bail 25%)/32768         1833 ns          992 ns
std::find(vector&lt;char&gt;) (process all)/8               1.21 ns         1.22 ns
std::find(vector&lt;char&gt;) (process all)/50              1.92 ns         1.93 ns
std::find(vector&lt;char&gt;) (process all)/1024            16.6 ns         16.9 ns
std::find(vector&lt;char&gt;) (process all)/8192             134 ns          136 ns
std::find(vector&lt;char&gt;) (process all)/32768            488 ns          503 ns
std::find(vector&lt;int&gt;) (process all)/8                2.45 ns         2.48 ns
std::find(vector&lt;int&gt;) (process all)/50               12.7 ns         12.7 ns
std::find(vector&lt;int&gt;) (process all)/1024              236 ns          236 ns
std::find(vector&lt;int&gt;) (process all)/8192             1830 ns         1834 ns
std::find(vector&lt;int&gt;) (process all)/32768            7351 ns         7346 ns
std::find(vector&lt;long long&gt;) (process all)/8          2.02 ns         1.45 ns
std::find(vector&lt;long long&gt;) (process all)/50         12.0 ns         6.12 ns
std::find(vector&lt;long long&gt;) (process all)/1024        235 ns          123 ns
std::find(vector&lt;long long&gt;) (process all)/8192       1830 ns          983 ns
std::find(vector&lt;long long&gt;) (process all)/32768      7306 ns         3969 ns
std::find(vector&lt;bool&gt;) (process all)/8               1.14 ns         1.15 ns
std::find(vector&lt;bool&gt;) (process all)/50              1.16 ns         1.17 ns
std::find(vector&lt;bool&gt;) (process all)/1024            4.51 ns         4.53 ns
std::find(vector&lt;bool&gt;) (process all)/8192            33.6 ns         33.5 ns
std::find(vector&lt;bool&gt;) (process all)/1048576         3660 ns         3660 ns

Full diff: https://github.com/llvm/llvm-project/pull/156431.diff

4 Files Affected:

  • (modified) libcxx/docs/ReleaseNotes/22.rst (+2)
  • (modified) libcxx/include/__algorithm/find.h (+83-23)
  • (modified) libcxx/include/__algorithm/simd_utils.h (+9-2)
  • (modified) libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp (+1)
diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst
index cd8171dc5d1e8..f36230697a25c 100644
--- a/libcxx/docs/ReleaseNotes/22.rst
+++ b/libcxx/docs/ReleaseNotes/22.rst
@@ -62,6 +62,8 @@ Improvements and New Features
 - ``ofstream::write`` has been optimized to pass through large strings to system calls directly instead of copying them
   in chunks into a buffer.
 
+- The performance of ``std::find`` has been improved by up to 2x for integral types
+
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 8c8cb5820fee3..b76216c0a2d7b 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -12,6 +12,7 @@
 
 #include <__algorithm/find_segment_if.h>
 #include <__algorithm/min.h>
+#include <__algorithm/simd_utils.h>
 #include <__algorithm/unwrap_iter.h>
 #include <__bit/countr.h>
 #include <__bit/invert_if.h>
@@ -44,39 +45,98 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // generic implementation
 template <class _Iter, class _Sent, class _Tp, class _Proj>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
-__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
   for (; __first != __last; ++__first)
     if (std::__invoke(__proj, *__first) == __value)
       break;
   return __first;
 }
 
-// trivially equality comparable implementations
-template <class _Tp,
-          class _Up,
-          class _Proj,
-          __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
-                            sizeof(_Tp) == 1,
-                        int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
-  if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
-    return __ret;
-  return __last;
+template <class _Iter, class _Sent, class _Tp, class _Proj>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
+__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+  return std::__find_loop(std::move(__first), std::move(__last), __value, __proj);
 }
 
-#if _LIBCPP_HAS_WIDE_CHARACTERS
-template <class _Tp,
-          class _Up,
-          class _Proj,
-          __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
-                            sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t),
-                        int> = 0>
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+template <class _Tp, class _Up>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find_vectorized(_Tp* __first, _Tp* __last, _Up __value) {
+  constexpr size_t __unroll_count = 4;
+  constexpr size_t __vec_size     = __native_vector_size<_Tp>;
+  using __vec                     = __simd_vector<_Tp, __vec_size>;
+
+  auto __orig_first = __first;
+
+  auto __values = static_cast<__simd_vector<_Up, __vec_size>>(__value);
+  while (static_cast<size_t>(__last - __first) >= __unroll_count * __vec_size) [[__unlikely__]] {
+    __vec __lhs[__unroll_count];
+
+    for (size_t __i = 0; __i != __unroll_count; ++__i)
+      __lhs[__i] = std::__load_vector<__vec>(__first + __i * __vec_size);
+
+    for (size_t __i = 0; __i != __unroll_count; ++__i) {
+      if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
+        auto __offset = __i * __vec_size + std::__find_first_set(__cmp_res);
+        return __first + __offset;
+      }
+    }
+
+    __first += __unroll_count * __vec_size;
+  }
+
+  // check the remaining 0-3 vectors
+  while (static_cast<size_t>(__last - __first) >= __vec_size) {
+    if (auto __cmp_res = std::__load_vector<__vec>(__first) == __values; std::__any_of(__cmp_res)) {
+      return __first + std::__find_first_set(__cmp_res);
+    }
+    __first += __vec_size;
+  }
+
+  if (__last - __first == 0)
+    return __first;
+
+  // Check if we can load elements in front of the current pointer. If that's the case load a vector at
+  // (last - vector_size) to check the remaining elements
+  if (static_cast<size_t>(__first - __orig_first) >= __vec_size) {
+    __first = __last - __vec_size;
+    return __first + std::__find_first_set(std::__load_vector<__vec>(__first) == __values);
+  }
+
+  __identity __proj;
+  return std::__find_loop(__first, __last, __value, __proj);
+}
+#endif
+
+// trivially equality comparable implementations
+template <
+    class _Tp,
+    class _Up,
+    class _Proj,
+    __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
-  if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
-    return __ret;
-  return __last;
+  if constexpr (sizeof(_Tp) == 1) {
+    if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
+      return __ret;
+    return __last;
+  }
+#if _LIBCPP_HAS_WIDE_CHARACTERS
+  else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) {
+    if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
+      return __ret;
+    return __last;
+  }
+#endif
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+  else if constexpr (is_integral<_Tp>::value) {
+    return std::__find_vectorized(__first, __last, __value);
+  }
+#endif
+  else {
+    __identity __proj;
+    return std::__find_loop(__first, __last, __value, __proj);
+  }
 }
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
 // TODO: This should also be possible to get right with different signedness
 // cast integral types to allow vectorization
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 07fef20f6166d..eb5da7f46ecd4 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -108,19 +108,26 @@ using __simd_vector_underlying_type_t _LIBCPP_NODEBUG = decltype(std::__simd_vec
 
 // This isn't inlined without always_inline when loading chars.
 template <class _VecT, class _Iter>
-[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(_Iter __iter) noexcept {
+[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _VecT
+__load_vector(_Iter __iter) noexcept {
   return [=]<size_t... _Indices>(index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept {
     return _VecT{__iter[_Indices]...};
   }(make_index_sequence<__simd_vector_size_v<_VecT>>{});
 }
 
+template <class _Tp, size_t _Np>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __any_of(__simd_vector<_Tp, _Np> __vec) noexcept {
+  return __builtin_reduce_or(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
+}
+
 template <class _Tp, size_t _Np>
 [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
   return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
 }
 
 template <class _Tp, size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_t
+__find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
   using __mask_vec = __simd_vector<bool, _Np>;
 
   // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
index b2ead1cc75585..afea31fb59e95 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
@@ -51,6 +51,7 @@ int main(int argc, char** argv) {
     // find
     bm.template operator()<std::vector<char>>("std::find(vector<char>) (" + comment + ")", std_find);
     bm.template operator()<std::vector<int>>("std::find(vector<int>) (" + comment + ")", std_find);
+    bm.template operator()<std::vector<long long>>("std::find(vector<long long>) (" + comment + ")", std_find);
     bm.template operator()<std::deque<int>>("std::find(deque<int>) (" + comment + ")", std_find);
     bm.template operator()<std::list<int>>("std::find(list<int>) (" + comment + ")", std_find);
 

@@ -44,39 +45,98 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// generic implementation
template <class _Iter, class _Sent, class _Tp, class _Proj>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In an ideal world, the code in __find_loop as-is would be using __builtin_assume_dereferenceable and getting vectorized: https://godbolt.org/z/M9xo6PdY1. IIUC this doesn't work in @philnik777 's experience because:

  1. The function that we want to vectorize can't contain pointer arithmetic, which is always going to happen because vector.begin() and vector.end() perform such arithmetic.
  2. The code is inside a function template, and apparently the optimization doesn't work inside templates?

I'm just summarizing the issues we encountered while trying to make it work with the builtin. CC @fhahn , let's figure out whether we've done something incorrectly here or whether something needs to change in the optimization.

Copy link
Member

@ldionne ldionne left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Let's land this and then switch over to using the Clang builtin when it's ready enough.

Copy link
Contributor

@fhahn fhahn left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you update the table in the description to include what platform/HW the benchmarks were run on?

Looking at it (and I might have missed something, would be easier to see if the table would report the % change), we only get speedups for long long, but not int?

@philnik777
Copy link
Contributor Author

Looking at it (and I might have missed something, would be easier to see if the table would report the % change), we only get speedups for long long, but not int?

Correct. We forward to wmemchr with int, but unfortunately that's currently quite slow on macOS. We plan to disable that "optimization" on macOS in a follow-up to get the vectorization for 4 byte integral types as well.

Copy link

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff origin/main HEAD --extensions cpp,h -- libcxx/include/__algorithm/find.h libcxx/include/__algorithm/simd_utils.h libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp

⚠️
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing origin/main to the base branch/commit you want to compare against.
⚠️

View the diff from clang-format here.
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 28ac2db14..5f32ae8fc 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -123,18 +123,18 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _T
       return __ret;
     return __last;
   }
-#if _LIBCPP_HAS_WIDE_CHARACTERS
+#  if _LIBCPP_HAS_WIDE_CHARACTERS
   else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) {
     if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
       return __ret;
     return __last;
   }
-#endif
-#if _LIBCPP_VECTORIZE_ALGORITHMS
+#  endif
+#  if _LIBCPP_VECTORIZE_ALGORITHMS
   else if constexpr (is_integral<_Tp>::value) {
     return std::__find_vectorized(__first, __last, __value);
   }
-#endif
+#  endif
   else {
     __identity __proj;
     return std::__find_loop(__first, __last, __value, __proj);

@philnik777 philnik777 merged commit 97367d1 into llvm:main Sep 29, 2025
12 of 16 checks passed
@philnik777 philnik777 deleted the optimize_find branch September 29, 2025 09:10
@mstorsjo
Copy link
Member

This breaks compilation with -flax-vector-conversions=none, which projects like libaom uses e.g. here: https://aomedia.googlesource.com/aom/+/refs/tags/v3.13.1/build/cmake/aom_configure.cmake#399

To reproduce:

$ clang -target x86_64-w64-mingw32 -flax-vector-conversions=none -include memory -c -x c++ - < /dev/null
In file included from <built-in>:1:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/memory:998:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/iterator:693:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__iterator/istreambuf_iterator.h:19:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__string/char_traits.h:13:
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__algorithm/find.h:80:41: error: cannot convert between vector values of different size ('__vec' (aka '__simd_vector<const char32_t, __vec_size>') and '__simd_vector<char32_t, __vec_size>' (vector of 4 'char32_t' values))
   80 |         if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
      |                              ~~~~~~~~~~ ^  ~~~~~~~~
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__algorithm/find.h:135:17: note: in instantiation of function template specialization 'std::__find_vectorized<const char32_t, char32_t>' requested here
  135 |     return std::__find_vectorized(__first, __last, __value);
      |                 ^
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__string/char_traits.h:320:37: note: in instantiation of function template specialization 'std::__find<const char32_t, char32_t, std::__identity, 0>' requested here
  320 |     const char_type* __match = std::__find(__s, __s + __n, __a, __proj);
      |                                     ^

@philnik777
Copy link
Contributor Author

This breaks compilation with -flax-vector-conversions=none, which projects like libaom uses e.g. here: https://aomedia.googlesource.com/aom/+/refs/tags/v3.13.1/build/cmake/aom_configure.cmake#399

To reproduce:

$ clang -target x86_64-w64-mingw32 -flax-vector-conversions=none -include memory -c -x c++ - < /dev/null
In file included from <built-in>:1:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/memory:998:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/iterator:693:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__iterator/istreambuf_iterator.h:19:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__string/char_traits.h:13:
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__algorithm/find.h:80:41: error: cannot convert between vector values of different size ('__vec' (aka '__simd_vector<const char32_t, __vec_size>') and '__simd_vector<char32_t, __vec_size>' (vector of 4 'char32_t' values))
   80 |         if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
      |                              ~~~~~~~~~~ ^  ~~~~~~~~
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__algorithm/find.h:135:17: note: in instantiation of function template specialization 'std::__find_vectorized<const char32_t, char32_t>' requested here
  135 |     return std::__find_vectorized(__first, __last, __value);
      |                 ^
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__string/char_traits.h:320:37: note: in instantiation of function template specialization 'std::__find<const char32_t, char32_t, std::__identity, 0>' requested here
  320 |     const char_type* __match = std::__find(__s, __s + __n, __a, __proj);
      |                                     ^

That looks a lot like a Clang bug to me. The vector elements have the same size AFAICT. Could you file a bug for that? I'll try to think of a way to work around the problem in the meantime.

@mstorsjo
Copy link
Member

This breaks compilation with -flax-vector-conversions=none, which projects like libaom uses e.g. here: https://aomedia.googlesource.com/aom/+/refs/tags/v3.13.1/build/cmake/aom_configure.cmake#399
To reproduce:

$ clang -target x86_64-w64-mingw32 -flax-vector-conversions=none -include memory -c -x c++ - < /dev/null
In file included from <built-in>:1:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/memory:998:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/iterator:693:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__iterator/istreambuf_iterator.h:19:
In file included from /home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__string/char_traits.h:13:
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__algorithm/find.h:80:41: error: cannot convert between vector values of different size ('__vec' (aka '__simd_vector<const char32_t, __vec_size>') and '__simd_vector<char32_t, __vec_size>' (vector of 4 'char32_t' values))
   80 |         if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
      |                              ~~~~~~~~~~ ^  ~~~~~~~~
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__algorithm/find.h:135:17: note: in instantiation of function template specialization 'std::__find_vectorized<const char32_t, char32_t>' requested here
  135 |     return std::__find_vectorized(__first, __last, __value);
      |                 ^
/home/martin/clang-nightly-tue/x86_64-w64-mingw32/include/c++/v1/__string/char_traits.h:320:37: note: in instantiation of function template specialization 'std::__find<const char32_t, char32_t, std::__identity, 0>' requested here
  320 |     const char_type* __match = std::__find(__s, __s + __n, __a, __proj);
      |                                     ^

That looks a lot like a Clang bug to me. The vector elements have the same size AFAICT. Could you file a bug for that? I'll try to think of a way to work around the problem in the meantime.

Sorry, this is way too nested for me to extract a reasonable bug report for Clang out of it. I can provide a preprocessed input file triggering the case though, but I presume you can do that yourself as well, with your build of Clang and the recent libc++ headers.

@mstorsjo
Copy link
Member

Or if you mean file a bug about this "recent libc++ fails with Clang with -flax-vector-conversions=none", then sure I can do that, essentially just what I posted above - so you then can pass that bug around between libc++ and Clang while figuring out who's to blame. :-)

mahesh-attarde pushed a commit to mahesh-attarde/llvm-project that referenced this pull request Oct 3, 2025
```
Apple M4:
-----------------------------------------------------------------------------
Benchmark                                                 old             new
-----------------------------------------------------------------------------
std::find(vector<char>) (bail 25%)/8                  1.43 ns         1.44 ns
std::find(vector<char>) (bail 25%)/1024               5.54 ns         5.59 ns
std::find(vector<char>) (bail 25%)/8192               38.4 ns         39.1 ns
std::find(vector<char>) (bail 25%)/32768               134 ns          136 ns
std::find(vector<int>) (bail 25%)/8                   1.56 ns         1.57 ns
std::find(vector<int>) (bail 25%)/1024                65.3 ns         65.4 ns
std::find(vector<int>) (bail 25%)/8192                 465 ns          464 ns
std::find(vector<int>) (bail 25%)/32768               1832 ns         1832 ns
std::find(vector<long long>) (bail 25%)/8            0.920 ns         1.20 ns
std::find(vector<long long>) (bail 25%)/1024          65.2 ns         31.2 ns
std::find(vector<long long>) (bail 25%)/8192           464 ns          255 ns
std::find(vector<long long>) (bail 25%)/32768         1833 ns          992 ns
std::find(vector<char>) (process all)/8               1.21 ns         1.22 ns
std::find(vector<char>) (process all)/50              1.92 ns         1.93 ns
std::find(vector<char>) (process all)/1024            16.6 ns         16.9 ns
std::find(vector<char>) (process all)/8192             134 ns          136 ns
std::find(vector<char>) (process all)/32768            488 ns          503 ns
std::find(vector<int>) (process all)/8                2.45 ns         2.48 ns
std::find(vector<int>) (process all)/50               12.7 ns         12.7 ns
std::find(vector<int>) (process all)/1024              236 ns          236 ns
std::find(vector<int>) (process all)/8192             1830 ns         1834 ns
std::find(vector<int>) (process all)/32768            7351 ns         7346 ns
std::find(vector<long long>) (process all)/8          2.02 ns         1.45 ns
std::find(vector<long long>) (process all)/50         12.0 ns         6.12 ns
std::find(vector<long long>) (process all)/1024        235 ns          123 ns
std::find(vector<long long>) (process all)/8192       1830 ns          983 ns
std::find(vector<long long>) (process all)/32768      7306 ns         3969 ns
std::find(vector<bool>) (process all)/8               1.14 ns         1.15 ns
std::find(vector<bool>) (process all)/50              1.16 ns         1.17 ns
std::find(vector<bool>) (process all)/1024            4.51 ns         4.53 ns
std::find(vector<bool>) (process all)/8192            33.6 ns         33.5 ns
std::find(vector<bool>) (process all)/1048576         3660 ns         3660 ns
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
libc++ libc++ C++ Standard Library. Not GNU libstdc++. Not libc++abi.
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants