-
Notifications
You must be signed in to change notification settings - Fork 15k
[Clang] Add constexpr support for AVX512 permutex2 intrinsics #165085
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
This patch enables compile-time evaluation of AVX512 permutex2var intrinsics in constexpr contexts. Extend shuffle generic to handle both integer immediate and vector mask operands. Resolves llvm#161335
|
@llvm/pr-subscribers-backend-x86 Author: NagaChaitanya Vellanki (chaitanyav) ChangesThis patch enables compile-time evaluation of AVX512 permutex2var intrinsics in constexpr contexts. Extend shuffle generic to handle both integer immediate and vector mask operands. Resolves #161335 Patch is 112.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165085.diff 19 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 0c85e280e748b..72e67d7dda3bc 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1746,75 +1746,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">;
}
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+ def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
+ def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
}
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+ def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+ def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
+ def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
}
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+ def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+ def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
+ def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
}
-let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
}
-let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
}
-let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">;
}
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
}
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
}
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
}
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d0b97a18e1815..f249a113a95ab 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3358,18 +3358,46 @@ static bool interp__builtin_ia32_shuffle_generic(
GetSourceIndex) {
assert(Call->getNumArgs() == 3);
- unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+
+ unsigned ShuffleMask = 0;
+ Pointer A, MaskVector, B;
+
+ QualType Arg2Type = Call->getArg(2)->getType();
+ bool IsVectorMask = false;
+ if (Arg2Type->isVectorType()) {
+ IsVectorMask = true;
+ B = S.Stk.pop<Pointer>();
+ MaskVector = S.Stk.pop<Pointer>();
+ A = S.Stk.pop<Pointer>();
+ } else if (Arg2Type->isIntegerType()) {
+ ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+ B = S.Stk.pop<Pointer>();
+ A = S.Stk.pop<Pointer>();
+ } else {
+ return false;
+ }
QualType Arg0Type = Call->getArg(0)->getType();
const auto *VecT = Arg0Type->castAs<VectorType>();
PrimType ElemT = *S.getContext().classify(VecT->getElementType());
unsigned NumElems = VecT->getNumElements();
- const Pointer &B = S.Stk.pop<Pointer>();
- const Pointer &A = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();
+ PrimType MaskElemT = PT_Uint32;
+ if (IsVectorMask) {
+ QualType Arg1Type = Call->getArg(1)->getType();
+ const auto *MaskVecT = Arg1Type->castAs<VectorType>();
+ QualType MaskElemType = MaskVecT->getElementType();
+ MaskElemT = *S.getContext().classify(MaskElemType);
+ }
+
for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
+ if (IsVectorMask) {
+ INT_TYPE_SWITCH(MaskElemT, {
+ ShuffleMask = static_cast<unsigned>(MaskVector.elem<T>(DstIdx));
+ });
+ }
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const Pointer &Src = (SrcVecIdx == 0) ? A : B;
TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
@@ -4345,6 +4373,60 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
});
+ case X86::BI__builtin_ia32_vpermi2varq128:
+ case X86::BI__builtin_ia32_vpermi2varpd128:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x1;
+ unsigned SrcIdx = (ShuffleMask >> 1) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2vard128:
+ case X86::BI__builtin_ia32_vpermi2varps128:
+ case X86::BI__builtin_ia32_vpermi2varq256:
+ case X86::BI__builtin_ia32_vpermi2varpd256:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x3;
+ unsigned SrcIdx = (ShuffleMask >> 2) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2varhi128:
+ case X86::BI__builtin_ia32_vpermi2vard256:
+ case X86::BI__builtin_ia32_vpermi2varps256:
+ case X86::BI__builtin_ia32_vpermi2varq512:
+ case X86::BI__builtin_ia32_vpermi2varpd512:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x7;
+ unsigned SrcIdx = (ShuffleMask >> 3) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2varqi128:
+ case X86::BI__builtin_ia32_vpermi2varhi256:
+ case X86::BI__builtin_ia32_vpermi2vard512:
+ case X86::BI__builtin_ia32_vpermi2varps512:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0xF;
+ unsigned SrcIdx = (ShuffleMask >> 4) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2varqi256:
+ case X86::BI__builtin_ia32_vpermi2varhi512:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x1F;
+ unsigned SrcIdx = (ShuffleMask >> 5) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2varqi512:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x3F;
+ unsigned SrcIdx = (ShuffleMask >> 6) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 29ee089505125..1427005b9bd79 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11628,21 +11628,38 @@ static bool evalShuffleGeneric(
if (!VT)
return false;
- APSInt MaskImm;
- if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
- return false;
- unsigned ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
+ unsigned ShuffleMask = 0;
+ APValue A, MaskVector, B;
+ bool IsVectorMask = false;
- APValue A, B;
- if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
- !EvaluateAsRValue(Info, Call->getArg(1), B))
+ QualType Arg2Type = Call->getArg(2)->getType();
+ if (Arg2Type->isVectorType()) {
+ IsVectorMask = true;
+ if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
+ !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) ||
+ !EvaluateAsRValue(Info, Call->getArg(2), B))
+ return false;
+ } else if (Arg2Type->isIntegerType()) {
+ APSInt MaskImm;
+ if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
+ return false;
+ ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
+ if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
+ !EvaluateAsRValue(Info, Call->getArg(1), B))
+ return false;
+ } else {
return false;
+ }
unsigned NumElts = VT->getNumElements();
- SmallVector<APValue, 16> ResultElements;
+ SmallVector<APValue, 64> ResultElements;
ResultElements.reserve(NumElts);
for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
+ if (IsVectorMask) {
+ ShuffleMask = static_cast<unsigned>(
+ MaskVector.getVectorElt(DstIdx).getInt().getZExtValue());
+ }
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const APValue &Src = (SrcVecIdx == 0) ? A : B;
ResultElements.push_back(Src.getVectorElt(SrcIdx));
@@ -13048,6 +13065,84 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_vpermi2varq128:
+ case X86::BI__builtin_ia32_vpermi2varpd128: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x1;
+ unsigned SrcIdx = (ShuffleMask >> 1) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2vard128:
+ case X86::BI__builtin_ia32_vpermi2varps128:
+ case X86::BI__builtin_ia32_vpermi2varq256:
+ case X86::BI__builtin_ia32_vpermi2varpd256: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x3;
+ unsigned SrcIdx = (ShuffleMask >> 2) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2varhi128:
+ case X86::BI__builtin_ia32_vpermi2vard256:
+ case X86::BI__builtin_ia32_vpermi2varps256:
+ case X86::BI__builtin_ia32_vpermi2varq512:
+ case X86::BI__builtin_ia32_vpermi2varpd512: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x7;
+ unsigned SrcIdx = (ShuffleMask >> 3) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2varqi128:
+ case X86::BI__builtin_ia32_vpermi2varhi256:
+ case X86::BI__builtin_ia32_vpermi2vard512:
+ case X86::BI__builtin_ia32_vpermi2varps512: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0xF;
+ unsigned SrcIdx = (ShuffleMask >> 4) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2varqi256:
+ case X86::BI__builtin_ia32_vpermi2varhi512: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x1F;
+ unsigned SrcIdx = (ShuffleMask >> 5) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2varqi512: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x3F;
+ unsigned SrcIdx = (ShuffleMask >> 6) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
}
}
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 37ebc4f46a826..46ec12a63ef9c 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
__min_vector_width__(512)))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#endif
+
static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
}
@@ -167,7 +173,7 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
(__v32bf)__A);
}
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
(__v32hi)__B);
@@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
(__v32bf)_mm512_setzero_pbh());
}
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
#undef __DEFAULT_FN_ATTRS512
#endif
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 765cd682986b4..8fb8cd7cd0865 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1)));
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
__min_vector_width__(128)))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) {
return __builtin_bit_cast(__m256bh, _mm256_setzero_ps());
}
@@ -287,13 +295,13 @@ _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) {
(__v16bf)__A);
}
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) {
return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
(__v8hi)__B);
}
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
+static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) {
return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
(__v16hi)__B);
@@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
-
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#endif
#endif
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index ac75b6ccde735..aab1f2b61ab8a 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -969,35 +969,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi16(__m512i __A, _...
[truncated]
|
|
@llvm/pr-subscribers-clang Author: NagaChaitanya Vellanki (chaitanyav) ChangesThis patch enables compile-time evaluation of AVX512 permutex2var intrinsics in constexpr contexts. Extend shuffle generic to handle both integer immediate and vector mask operands. Resolves #161335 Patch is 112.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165085.diff 19 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 0c85e280e748b..72e67d7dda3bc 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1746,75 +1746,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">;
}
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+ def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
+ def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
}
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+ def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+ def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
+ def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
}
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+ def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+ def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
+ def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
}
-let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
}
-let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
}
-let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">;
}
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
}
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
}
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
}
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d0b97a18e1815..f249a113a95ab 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3358,18 +3358,46 @@ static bool interp__builtin_ia32_shuffle_generic(
GetSourceIndex) {
assert(Call->getNumArgs() == 3);
- unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+
+ unsigned ShuffleMask = 0;
+ Pointer A, MaskVector, B;
+
+ QualType Arg2Type = Call->getArg(2)->getType();
+ bool IsVectorMask = false;
+ if (Arg2Type->isVectorType()) {
+ IsVectorMask = true;
+ B = S.Stk.pop<Pointer>();
+ MaskVector = S.Stk.pop<Pointer>();
+ A = S.Stk.pop<Pointer>();
+ } else if (Arg2Type->isIntegerType()) {
+ ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+ B = S.Stk.pop<Pointer>();
+ A = S.Stk.pop<Pointer>();
+ } else {
+ return false;
+ }
QualType Arg0Type = Call->getArg(0)->getType();
const auto *VecT = Arg0Type->castAs<VectorType>();
PrimType ElemT = *S.getContext().classify(VecT->getElementType());
unsigned NumElems = VecT->getNumElements();
- const Pointer &B = S.Stk.pop<Pointer>();
- const Pointer &A = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();
+ PrimType MaskElemT = PT_Uint32;
+ if (IsVectorMask) {
+ QualType Arg1Type = Call->getArg(1)->getType();
+ const auto *MaskVecT = Arg1Type->castAs<VectorType>();
+ QualType MaskElemType = MaskVecT->getElementType();
+ MaskElemT = *S.getContext().classify(MaskElemType);
+ }
+
for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
+ if (IsVectorMask) {
+ INT_TYPE_SWITCH(MaskElemT, {
+ ShuffleMask = static_cast<unsigned>(MaskVector.elem<T>(DstIdx));
+ });
+ }
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const Pointer &Src = (SrcVecIdx == 0) ? A : B;
TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
@@ -4345,6 +4373,60 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
});
+ case X86::BI__builtin_ia32_vpermi2varq128:
+ case X86::BI__builtin_ia32_vpermi2varpd128:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x1;
+ unsigned SrcIdx = (ShuffleMask >> 1) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2vard128:
+ case X86::BI__builtin_ia32_vpermi2varps128:
+ case X86::BI__builtin_ia32_vpermi2varq256:
+ case X86::BI__builtin_ia32_vpermi2varpd256:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x3;
+ unsigned SrcIdx = (ShuffleMask >> 2) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2varhi128:
+ case X86::BI__builtin_ia32_vpermi2vard256:
+ case X86::BI__builtin_ia32_vpermi2varps256:
+ case X86::BI__builtin_ia32_vpermi2varq512:
+ case X86::BI__builtin_ia32_vpermi2varpd512:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x7;
+ unsigned SrcIdx = (ShuffleMask >> 3) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2varqi128:
+ case X86::BI__builtin_ia32_vpermi2varhi256:
+ case X86::BI__builtin_ia32_vpermi2vard512:
+ case X86::BI__builtin_ia32_vpermi2varps512:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0xF;
+ unsigned SrcIdx = (ShuffleMask >> 4) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2varqi256:
+ case X86::BI__builtin_ia32_vpermi2varhi512:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x1F;
+ unsigned SrcIdx = (ShuffleMask >> 5) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
+ case X86::BI__builtin_ia32_vpermi2varqi512:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x3F;
+ unsigned SrcIdx = (ShuffleMask >> 6) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ });
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 29ee089505125..1427005b9bd79 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11628,21 +11628,38 @@ static bool evalShuffleGeneric(
if (!VT)
return false;
- APSInt MaskImm;
- if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
- return false;
- unsigned ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
+ unsigned ShuffleMask = 0;
+ APValue A, MaskVector, B;
+ bool IsVectorMask = false;
- APValue A, B;
- if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
- !EvaluateAsRValue(Info, Call->getArg(1), B))
+ QualType Arg2Type = Call->getArg(2)->getType();
+ if (Arg2Type->isVectorType()) {
+ IsVectorMask = true;
+ if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
+ !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) ||
+ !EvaluateAsRValue(Info, Call->getArg(2), B))
+ return false;
+ } else if (Arg2Type->isIntegerType()) {
+ APSInt MaskImm;
+ if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
+ return false;
+ ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
+ if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
+ !EvaluateAsRValue(Info, Call->getArg(1), B))
+ return false;
+ } else {
return false;
+ }
unsigned NumElts = VT->getNumElements();
- SmallVector<APValue, 16> ResultElements;
+ SmallVector<APValue, 64> ResultElements;
ResultElements.reserve(NumElts);
for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
+ if (IsVectorMask) {
+ ShuffleMask = static_cast<unsigned>(
+ MaskVector.getVectorElt(DstIdx).getInt().getZExtValue());
+ }
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const APValue &Src = (SrcVecIdx == 0) ? A : B;
ResultElements.push_back(Src.getVectorElt(SrcIdx));
@@ -13048,6 +13065,84 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_vpermi2varq128:
+ case X86::BI__builtin_ia32_vpermi2varpd128: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x1;
+ unsigned SrcIdx = (ShuffleMask >> 1) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2vard128:
+ case X86::BI__builtin_ia32_vpermi2varps128:
+ case X86::BI__builtin_ia32_vpermi2varq256:
+ case X86::BI__builtin_ia32_vpermi2varpd256: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x3;
+ unsigned SrcIdx = (ShuffleMask >> 2) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2varhi128:
+ case X86::BI__builtin_ia32_vpermi2vard256:
+ case X86::BI__builtin_ia32_vpermi2varps256:
+ case X86::BI__builtin_ia32_vpermi2varq512:
+ case X86::BI__builtin_ia32_vpermi2varpd512: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x7;
+ unsigned SrcIdx = (ShuffleMask >> 3) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2varqi128:
+ case X86::BI__builtin_ia32_vpermi2varhi256:
+ case X86::BI__builtin_ia32_vpermi2vard512:
+ case X86::BI__builtin_ia32_vpermi2varps512: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0xF;
+ unsigned SrcIdx = (ShuffleMask >> 4) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2varqi256:
+ case X86::BI__builtin_ia32_vpermi2varhi512: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x1F;
+ unsigned SrcIdx = (ShuffleMask >> 5) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
+ case X86::BI__builtin_ia32_vpermi2varqi512: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) {
+ unsigned offset = ShuffleMask & 0x3F;
+ unsigned SrcIdx = (ShuffleMask >> 6) & 0x1 ? 1 : 0;
+ return std::pair<unsigned, unsigned>{SrcIdx, offset};
+ }))
+ return false;
+ return Success(R, E);
+ }
}
}
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 37ebc4f46a826..46ec12a63ef9c 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
__min_vector_width__(512)))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#endif
+
static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
}
@@ -167,7 +173,7 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
(__v32bf)__A);
}
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
(__v32hi)__B);
@@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
(__v32bf)_mm512_setzero_pbh());
}
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
#undef __DEFAULT_FN_ATTRS512
#endif
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 765cd682986b4..8fb8cd7cd0865 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1)));
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
__min_vector_width__(128)))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) {
return __builtin_bit_cast(__m256bh, _mm256_setzero_ps());
}
@@ -287,13 +295,13 @@ _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) {
(__v16bf)__A);
}
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) {
return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
(__v8hi)__B);
}
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
+static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) {
return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
(__v16hi)__B);
@@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
-
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#endif
#endif
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index ac75b6ccde735..aab1f2b61ab8a 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -969,35 +969,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi16(__m512i __A, _...
[truncated]
|
This patch enables compile-time evaluation of AVX512 permutex2var intrinsics in constexpr contexts.
Extend shuffle generic to handle both integer immediate and vector mask operands.
Resolves #161335