@@ -31,7 +31,7 @@ macro_rules! wmul_impl {
3131 } ;
3232
3333 // simd bulk implementation
34- ( $( ( $ty: ident, $wide: ident ) , ) +, $shift: expr) => {
34+ ( $( ( $ty: ident, $wide: ty ) , ) +, $shift: expr) => {
3535 $(
3636 impl WideningMultiply for $ty {
3737 type Output = ( $ty, $ty) ;
@@ -152,7 +152,8 @@ mod simd_wmul {
152152 ( u8x4, u16x4) ,
153153 ( u8x8, u16x8) ,
154154 ( u8x16, u16x16) ,
155- ( u8x32, u16x32) , ,
155+ ( u8x32, u16x32) ,
156+ ( u8x64, Simd <u16 , 64 >) , ,
156157 8
157158 }
158159
@@ -162,6 +163,8 @@ mod simd_wmul {
162163 wmul_impl ! { ( u16x8, u32x8) , , 16 }
163164 #[ cfg( not( target_feature = "avx2" ) ) ]
164165 wmul_impl ! { ( u16x16, u32x16) , , 16 }
166+ #[ cfg( not( target_feature = "avx512bw" ) ) ]
167+ wmul_impl ! { ( u16x32, Simd <u32 , 32 >) , , 16 }
165168
166169 // 16-bit lane widths allow use of the x86 `mulhi` instructions, which
167170 // means `wmul` can be implemented with only two instructions.
@@ -191,15 +194,11 @@ mod simd_wmul {
191194 wmul_impl ! {
192195 ( u32x2, u64x2) ,
193196 ( u32x4, u64x4) ,
194- ( u32x8, u64x8) , ,
197+ ( u32x8, u64x8) ,
198+ ( u32x16, Simd <u64 , 16 >) , ,
195199 32
196200 }
197201
198- // TODO: optimize, this seems to seriously slow things down
199- wmul_impl_large ! { ( u8x64, ) u8 , 4 }
200- #[ cfg( not( target_feature = "avx512bw" ) ) ]
201- wmul_impl_large ! { ( u16x32, ) u16 , 8 }
202- wmul_impl_large ! { ( u32x16, ) u32 , 16 }
203202 wmul_impl_large ! { ( u64x2, u64x4, u64x8, ) u64 , 32 }
204203}
205204
0 commit comments