File tree Expand file tree Collapse file tree 1 file changed +9
-11
lines changed Expand file tree Collapse file tree 1 file changed +9
-11
lines changed Original file line number Diff line number Diff line change @@ -20,31 +20,29 @@ namespace cp_algo {
2020#endif
2121 }
2222
23- i64x4 lround (dx4 a) {
24- #ifdef __AVX2__
25- return __builtin_convertvector (_mm256_round_pd (a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), i64x4);
26- #else
27- return __builtin_convertvector (a < 0 ? a - 0.5 : a + 0.5 , i64x4);
28- #endif
23+ i64x4 lround (dx4 x) {
24+ // https://stackoverflow.com/a/77376595
25+ static constexpr dx4 magic = dx4 () + double (3ULL << 51 );
26+ return i64x4 (x + magic) - i64x4 (magic);
2927 }
3028
3129 dx4 round (dx4 a) {
3230#ifdef __AVX2__
3331 return _mm256_round_pd (a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
3432#else
35- return __builtin_convertvector (lround (a), Simd );
33+ return __builtin_convertvector (lround (a), dx4 );
3634#endif
3735 }
3836
3937 u64x4 montgomery_reduce (u64x4 x, u64x4 mod, u64x4 imod) {
40- #ifdef __AVX2__
38+ #ifndef __AVX2__
4139 auto x_ninv = _mm256_mul_epu32 (__m256i (x), __m256i (imod));
4240 auto x_res = _mm256_add_epi64 (__m256i (x), _mm256_mul_epu32 (x_ninv, __m256i (mod)));
4341 return u64x4 (_mm256_bsrli_epi128 (x_res, 4 ));
4442#else
45- auto x_ninv = x * imod;
46- auto x_res = x + ((x_ninv << 32 ) >> 32 ) * mod ;
47- return u64x4 (x_res >> 32 ) ;
43+
44+ auto x_ninv = u64x4 ( u32x8 (x) * u32x8 (imod)) ;
45+ return (x + x_ninv * mod) >> 32 ;
4846#endif
4947 }
5048
You can’t perform that action at this time.
0 commit comments