25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ 26 #define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ 28 #if __cplusplus >= 201703L 30 #if !_GLIBCXX_SIMD_X86INTRIN 32 "simd_x86.h may only be included when MMX or SSE on x86(_64) are available" 35 _GLIBCXX_SIMD_BEGIN_NAMESPACE
40 template <
typename _Tp,
size_t _Np>
41 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
42 __to_masktype(_SimdWrapper<_Tp, _Np> __x)
44 return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np
>>(
48 template <
typename _TV,
50 = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
51 typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
52 _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
53 __to_masktype(_TV __x)
54 {
return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>
>(__x); }
// __interleave128_lo(__av, __bv): returns a _Tp (the common type of both
// arguments by default) whose elements alternate a, b, a, b, ...
// The index patterns below show that, within every 16-byte chunk of the
// result, the elements come from the lower half of the corresponding 16-byte
// chunk of each input (e.g. for 32 bytes / 4 elements: a0, b0, a2, b2).
// Branches are selected at compile time by sizeof(_Tp) and element count.
//
// NOTE(review): the embedded numbering jumps from 61 to 65 and from 113 to
// 116. The definitions of the locals __a / __b (presumably __av / __bv
// converted to _Tp -- confirm against upstream) and the tail of the
// 64-element initializer plus its trailing `else` are missing from this
// extract.
58 template <
typename _Ap,
typename _Bp,
typename _Tp = common_type_t<_Ap, _Bp>,
59 typename _Trait = _VectorTraits<_Tp>>
60 _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
61 __interleave128_lo(
const _Ap& __av,
const _Bp& __bv)
// --- 16-byte vectors: plain interleave of the low half ---
65 if constexpr (
sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
66 return _Tp{__a[0], __b[0]};
67 else if constexpr (
sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
68 return _Tp{__a[0], __b[0], __a[1], __b[1]};
69 else if constexpr (
sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
70 return _Tp{__a[0], __b[0], __a[1], __b[1],
71 __a[2], __b[2], __a[3], __b[3]};
72 else if constexpr (
sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
73 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
74 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
75 __a[6], __b[6], __a[7], __b[7]};
// --- 32-byte vectors: interleave the low half of each 16-byte chunk ---
76 else if constexpr (
sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
77 return _Tp{__a[0], __b[0], __a[2], __b[2]};
78 else if constexpr (
sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
79 return _Tp{__a[0], __b[0], __a[1], __b[1],
80 __a[4], __b[4], __a[5], __b[5]};
81 else if constexpr (
sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
82 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
83 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
84 __a[10], __b[10], __a[11], __b[11]};
85 else if constexpr (
sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
86 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
87 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
88 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
89 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
90 __a[22], __b[22], __a[23], __b[23]};
// --- 64-byte vectors: same pattern over four 16-byte chunks ---
91 else if constexpr (
sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
92 return _Tp{__a[0], __b[0], __a[2], __b[2],
93 __a[4], __b[4], __a[6], __b[6]};
94 else if constexpr (
sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
95 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
96 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
97 __a[12], __b[12], __a[13], __b[13]};
98 else if constexpr (
sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
99 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
100 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
101 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
102 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
103 __a[26], __b[26], __a[27], __b[27]};
104 else if constexpr (
sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
105 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
106 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
107 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
108 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
109 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
110 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
111 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
112 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
113 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
// Any other size/element-count combination is a compile-time error.
116 __assert_unreachable<_Tp>();
// Returns true when the vector __a is entirely zero: every path below either
// tests __a against itself with a *testz* intrinsic or compares an integer
// view of it with 0.
//
// NOTE(review): the declarator line (embedded 123-124) is missing from this
// extract; the name __is_zero is taken from the recursive calls further down.
121 template <
typename _Tp,
typename _TVT = _VectorTraits<_Tp>>
122 _GLIBCXX_SIMD_INTRINSIC constexpr
bool 125 if (!__builtin_is_constant_evaluated())
// Run-time path: pick the testz intrinsic matching ISA, width and element
// type.
127 if constexpr (__have_avx)
129 if constexpr (_TVT::template _S_is<float, 8>)
130 return _mm256_testz_ps(__a, __a);
131 else if constexpr (_TVT::template _S_is<double, 4>)
132 return _mm256_testz_pd(__a, __a);
133 else if constexpr (
sizeof(_Tp) == 32)
134 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
135 else if constexpr (_TVT::template _S_is<
float>)
136 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
137 else if constexpr (_TVT::template _S_is<
double, 2>)
138 return _mm_testz_pd(__a, __a);
140 return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
142 else if constexpr (__have_sse4_1)
143 return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
144 __intrin_bitcast<__m128i>(__a));
// Vectors of at most 8 bytes: compare as a single integer.
146 else if constexpr (sizeof(_Tp) <= 8)
147 return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
// Generic path: view as _LLong elements and OR the halves together,
// recursing until a 16-byte vector remains.
150 const auto __b = __vector_bitcast<_LLong>(__a);
151 if constexpr (
sizeof(__b) == 16)
152 return (__b[0] | __b[1]) == 0;
153 else if constexpr (sizeof(__b) == 32)
154 return __is_zero(__lo128(__b) | __hi128(__b));
155 else if constexpr (sizeof(__b) == 64)
156 return __is_zero(__lo256(__b) | __hi256(__b));
158 __assert_unreachable<_Tp>();
// Thin dispatcher over the x86 movemask intrinsics: 32-byte vectors use the
// _mm256_* variants, everything else the _mm_* variants; float and double
// element types use the _ps / _pd forms and all other element types use the
// byte-wise _epi8 form.
//
// NOTE(review): the declarator line (embedded 166-167) is missing from this
// extract; the name __movemask is inferred from call sites in __testz /
// __testnzc -- confirm against upstream.
164 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
165 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST
int 168 if constexpr (
sizeof(_Tp) == 32)
170 if constexpr (_TVT::template _S_is<float>)
171 return _mm256_movemask_ps(__to_intrin(__a));
172 else if constexpr (_TVT::template _S_is<double>)
173 return _mm256_movemask_pd(__to_intrin(__a));
175 return _mm256_movemask_epi8(__to_intrin(__a));
177 else if constexpr (_TVT::template _S_is<float>)
178 return _mm_movemask_ps(__to_intrin(__a));
179 else if constexpr (_TVT::template _S_is<double>)
180 return _mm_movemask_pd(__to_intrin(__a));
182 return _mm_movemask_epi8(__to_intrin(__a));
// __testz(__a, __b): wrapper around the *testz* intrinsic family for
// intrinsic vector types (the static_assert requires _TI to be exactly the
// __intrinsic_type_t for its element type and full size).
// At run time the intrinsic matching size / element type / ISA is used; when
// no suitable intrinsic is available the result is derived from
// __movemask(0 == __and(__a, __b)). During constant evaluation the result is
// __is_zero(__and(__a, __b)).
187 template <
typename _TI,
typename _TVT = _VectorTraits<_TI>>
188 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
int 189 __testz(_TI __a, _TI __b)
191 static_assert(is_same_v<_TI, __intrinsic_type_t<
typename _TVT::value_type,
192 _TVT::_S_full_size>>);
193 if (!__builtin_is_constant_evaluated())
195 if constexpr (
sizeof(_TI) == 32)
197 if constexpr (_TVT::template _S_is<float>)
198 return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
199 else if constexpr (_TVT::template _S_is<double>)
200 return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
202 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
// 16-byte float/double forms are only taken when __have_avx is set.
204 else if constexpr (_TVT::template _S_is<float> && __have_avx)
205 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
206 else if constexpr (_TVT::template _S_is<double> && __have_avx)
207 return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
208 else if constexpr (__have_sse4_1)
209 return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
210 __intrin_bitcast<__m128i>(__to_intrin(__b)));
// Fallback without a testz instruction.
212 return __movemask(0 == __and(__a, __b)) != 0;
// Constant-evaluation path.
215 return __is_zero(__and(__a, __b));
// __testc(__a, __b): wrapper around the *testc* intrinsic family for
// intrinsic vector types. During constant evaluation the result is
// __is_zero(__andnot(__a, __b)); otherwise the intrinsic matching the vector
// size and element type is called. The final 16-byte integer path requires
// SSE4.1 (enforced by the static_assert, written so that it depends on _TI).
221 template <
typename _TI,
typename _TVT = _VectorTraits<_TI>>
222 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
int 223 __testc(_TI __a, _TI __b)
225 static_assert(is_same_v<_TI, __intrinsic_type_t<
typename _TVT::value_type,
226 _TVT::_S_full_size>>);
227 if (__builtin_is_constant_evaluated())
228 return __is_zero(__andnot(__a, __b));
230 if constexpr (
sizeof(_TI) == 32)
232 if constexpr (_TVT::template _S_is<float>)
233 return _mm256_testc_ps(__a, __b);
234 else if constexpr (_TVT::template _S_is<double>)
235 return _mm256_testc_pd(__a, __b);
237 return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
239 else if constexpr (_TVT::template _S_is<float> && __have_avx)
240 return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
241 else if constexpr (_TVT::template _S_is<double> && __have_avx)
242 return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
// No non-SSE4.1 fallback exists for this function.
245 static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
246 return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
247 __intrin_bitcast<__m128i>(__to_intrin(__b)));
// __testnzc(__a, __b): wrapper around the *testnzc* intrinsic family for
// intrinsic vector types. The constant-evaluation path spells out the
// meaning: the result is true when neither __and(__a, __b) nor
// __andnot(__a, __b) is all-zero. At run time the intrinsic matching size,
// element type and ISA is used, with a __movemask-based fallback when no
// such instruction is available.
253 template <
typename _TI,
typename _TVT = _VectorTraits<_TI>>
254 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
int 255 __testnzc(_TI __a, _TI __b)
257 static_assert(is_same_v<_TI, __intrinsic_type_t<
typename _TVT::value_type,
258 _TVT::_S_full_size>>);
259 if (!__builtin_is_constant_evaluated())
261 if constexpr (
sizeof(_TI) == 32)
263 if constexpr (_TVT::template _S_is<float>)
264 return _mm256_testnzc_ps(__a, __b);
265 else if constexpr (_TVT::template _S_is<double>)
266 return _mm256_testnzc_pd(__a, __b);
268 return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
270 else if constexpr (_TVT::template _S_is<float> && __have_avx)
271 return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
272 else if constexpr (_TVT::template _S_is<double> && __have_avx)
273 return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
274 else if constexpr (__have_sse4_1)
275 return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
276 __intrin_bitcast<__m128i>(__to_intrin(__b)));
// Fallback without a testnzc instruction.
278 return __movemask(0 == __and(__a, __b)) == 0
279 && __movemask(0 == __andnot(__a, __b)) == 0;
// Constant-evaluation path.
282 return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
// Permutes __a in units of a quarter of the vector: the two middle quarters
// are swapped (quarter order 0, 2, 1, 3).
//  - 16 bytes: viewed as 4 x float/int, elements {0, 2, 1, 3}
//  - 32 bytes: viewed as 4 x double/_LLong, elements {0, 2, 1, 3}
//  - 64 bytes: viewed as 8 x double/_LLong, elements {0, 1, 4, 5, 2, 3, ...}
// The element type of the temporary view is floating-point when _Tp's value
// type is floating-point, integral otherwise.
//
// NOTE(review): the declarator line (embedded 291-292), the start of each
// `__x` definition and the tail of the 64-byte initializer are missing from
// this extract. The name __xzyw is inferred from its use in
// _S_store_bool_array -- confirm against upstream.
289 template <
typename _Tp,
typename _TVT = _VectorTraits<_Tp>>
290 _GLIBCXX_SIMD_INTRINSIC _Tp
293 if constexpr (
sizeof(_Tp) == 16)
296 is_floating_point_v<typename _TVT::value_type>, float,
int>>(__a);
297 return reinterpret_cast<_Tp
>(
298 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
300 else if constexpr (
sizeof(_Tp) == 32)
303 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
304 return reinterpret_cast<_Tp
>(
305 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
307 else if constexpr (
sizeof(_Tp) == 64)
310 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
311 return reinterpret_cast<_Tp
>(decltype(__x){__x[0], __x[1], __x[4],
312 __x[5], __x[2], __x[3],
316 __assert_unreachable<_Tp>();
321 template <
typename _Tp>
322 _GLIBCXX_SIMD_INTRINSIC
auto 323 __maskload_epi32(
const int* __ptr, _Tp __k)
325 if constexpr (
sizeof(__k) == 16)
326 return _mm_maskload_epi32(__ptr, __k);
328 return _mm256_maskload_epi32(__ptr, __k);
333 template <typename _Tp>
334 _GLIBCXX_SIMD_INTRINSIC auto
335 __maskload_epi64(const _LLong* __ptr, _Tp __k)
337 if constexpr (
sizeof(__k) == 16)
338 return _mm_maskload_epi64(__ptr, __k);
340 return _mm256_maskload_epi64(__ptr, __k);
345 template <typename _Tp>
346 _GLIBCXX_SIMD_INTRINSIC auto
347 __maskload_ps(const
float* __ptr, _Tp __k)
349 if constexpr (
sizeof(__k) == 16)
350 return _mm_maskload_ps(__ptr, __k);
352 return _mm256_maskload_ps(__ptr, __k);
357 template <typename _Tp>
358 _GLIBCXX_SIMD_INTRINSIC auto
359 __maskload_pd(const
double* __ptr, _Tp __k)
361 if constexpr (
sizeof(__k) == 16)
362 return _mm_maskload_pd(__ptr, __k);
364 return _mm256_maskload_pd(__ptr, __k);
// Conditional include of the conversion helpers (only under the PR85048
// workaround), followed by six compile-time predicates over <_Tp, _Np>.
// The visible tails show that each predicate requires a specific element
// type (float or double) and a specific size of __intrinsic_type_t<_Tp, _Np>:
// 16, 16, 32, 32, 64 and 64 bytes respectively. The two 64-byte variants
// additionally require __have_avx512f.
//
// NOTE(review): the declarator of every predicate (return type, name, and the
// first half of its condition) is missing from this extract, as is the
// #endif closing the #ifdef. Presumably these are the __is_sse_ps/pd,
// __is_avx_ps/pd and __is_avx512_ps/pd helpers -- confirm against upstream.
369 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 370 #include "simd_x86_conversions.h" 374 template <
typename _Tp,
size_t _Np>
380 float> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
383 template <
typename _Tp,
size_t _Np>
389 double> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
392 template <
typename _Tp,
size_t _Np>
398 float> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
401 template <
typename _Tp,
size_t _Np>
407 double> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
410 template <
typename _Tp,
size_t _Np>
414 return __have_avx512f
416 float> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
419 template <
typename _Tp,
size_t _Np>
423 return __have_avx512f
425 double> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
429 struct _MaskImplX86Mixin;
432 struct _CommonImplX86 : _CommonImplBuiltin
// (Only compiled under the PR85048 workaround.)
// Compile-time predicate over a conversion _From -> _To with a destination
// vector of _ToSize bytes. The visible conditions cover:
//  - 8-byte integer -> integer: true for 2-byte targets without SSSE3 and for
//    1-byte targets without AVX512F;
//  - floating-point -> integer: conditions on source/target sizes and ISA
//    (partially missing, see note);
//  - 8-byte integer -> floating-point: true for 4-byte targets in 16-byte
//    vectors and for 8-byte targets in vectors smaller than 64 bytes.
//
// NOTE(review): several condition lines and the final fallback return are
// missing from this extract (embedded numbering has gaps at 441, 446,
// 448-449, 451, 454-457). Do not rely on the exact predicate without
// checking upstream.
434 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 436 template <
typename _From,
typename _To,
size_t _ToSize>
437 static constexpr
bool _S_converts_via_decomposition()
439 if constexpr (is_integral_v<
440 _From> && is_integral_v<_To> &&
sizeof(_From) == 8
442 return (sizeof(_To) == 2 && !__have_ssse3)
443 || (sizeof(_To) == 1 && !__have_avx512f);
444 else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
445 return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
447 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
450 is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8
452 return (sizeof(_To) == 4 && _ToSize == 16)
453 || (sizeof(_To) == 8 && _ToSize < 64);
// Variable-template shorthand for
// _S_converts_via_decomposition<_From, _To, _ToSize>().
458 template <typename _From, typename _To,
size_t _ToSize>
459 static inline constexpr
bool __converts_via_decomposition_v
460 = _S_converts_via_decomposition<_From, _To, _ToSize>();
465 using _CommonImplBuiltin::_S_store;
// _S_store(__x, __addr): stores _Bytes = _Np * sizeof(_Tp) bytes of __x.
//
// When _Bytes is not a power of two and AVX512BW+VL is available, the store
// is done with a single masked store. The element width of the masked store
// is chosen from the lowest set bit of _Bytes (1, 2, 4, otherwise 8 bytes)
// and the mask is an all-ones constant shifted right so that exactly
// _Bytes / width low bits remain set. In every other case the call is
// forwarded to _CommonImplBuiltin::_S_store.
//
// NOTE(review): the second parameter line (declaring __addr) and the braces /
// `else` lines are missing from this extract.
467 template <
typename _Tp,
size_t _Np>
468 _GLIBCXX_SIMD_INTRINSIC
static void _S_store(_SimdWrapper<_Tp, _Np> __x,
471 constexpr
size_t _Bytes = _Np *
sizeof(_Tp);
// (_Bytes & (_Bytes - 1)) != 0  <=>  _Bytes is not a power of two.
473 if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
475 const auto __v = __to_intrin(__x);
// Odd byte count: byte-granular mask.
477 if constexpr (_Bytes & 1)
479 if constexpr (_Bytes < 16)
480 _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
481 __intrin_bitcast<__m128i>(__v));
482 else if constexpr (_Bytes < 32)
483 _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
484 __intrin_bitcast<__m256i>(__v));
486 _mm512_mask_storeu_epi8(__addr,
487 0xffffffffffffffffull >> (64 - _Bytes),
488 __intrin_bitcast<__m512i>(__v));
// Multiple of 2 but not of 4: 16-bit-granular mask.
490 else if constexpr (_Bytes & 2)
492 if constexpr (_Bytes < 16)
493 _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
494 __intrin_bitcast<__m128i>(__v));
495 else if constexpr (_Bytes < 32)
496 _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
497 __intrin_bitcast<__m256i>(__v));
499 _mm512_mask_storeu_epi16(__addr,
500 0xffffffffull >> (32 - _Bytes / 2),
501 __intrin_bitcast<__m512i>(__v));
// Multiple of 4 but not of 8: 32-bit-granular mask.
503 else if constexpr (_Bytes & 4)
505 if constexpr (_Bytes < 16)
506 _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
507 __intrin_bitcast<__m128i>(__v));
508 else if constexpr (_Bytes < 32)
509 _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
510 __intrin_bitcast<__m256i>(__v));
512 _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
513 __intrin_bitcast<__m512i>(__v));
// Multiple of 8: 64-bit-granular mask. The string below is the message of a
// static_assert whose opening line is missing from this extract.
519 "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes " 520 "- 1)) != 0 is impossible");
521 if constexpr (_Bytes < 32)
522 _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
523 __intrin_bitcast<__m256i>(__v));
525 _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
526 __intrin_bitcast<__m512i>(__v));
// Power-of-two size or no AVX512BW+VL: generic implementation.
530 _CommonImplBuiltin::_S_store(__x, __addr);
// _S_store_bool_array(__x, __mem): expands the _Np mask bits of __x into
// bytes and stores them to __mem via _S_store. Strategy by ISA:
//  - AVX512BW+VL: *_movm_epi8 turns the bitmask into a byte vector, which is
//    ANDed with 1 before storing;
//  - BMI2: _pdep_u32 / _pdep_u64 deposit the bits into the low bit of each
//    byte (patterns 0x01010101...), processed sizeof(size_t) bools at a time;
//  - SSE2 with more than 7 elements: 16 bools per iteration, either via an
//    AVX512F masked move plus packs, or via unpack + AND with per-bit
//    constants;
//  - otherwise: _CommonImplBuiltin::_S_store_bool_array.
//
// NOTE(review): many lines (lambda headers, declarations of __bools /
// __as32bits / __as16bits, braces, `else`) are missing from this extract.
535 template <
size_t _Np,
bool _Sanitized>
536 _GLIBCXX_SIMD_INTRINSIC static constexpr
void 537 _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x,
bool* __mem)
539 if constexpr (__have_avx512bw_vl)
540 _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>([=]() constexpr {
541 if constexpr (_Np <= 16)
542 return _mm_movm_epi8(__x._M_to_bits());
543 else if constexpr (_Np <= 32)
544 return _mm256_movm_epi8(__x._M_to_bits());
545 else if constexpr (_Np <= 64)
546 return _mm512_movm_epi8(__x._M_to_bits());
548 __assert_unreachable<_SizeConstant<_Np>>();
551 else if constexpr (__have_bmi2)
553 if constexpr (_Np <= 4)
554 _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
556 __execute_n_times<__div_roundup(_Np, sizeof(
size_t))>(
558 constexpr
size_t __offset = __i *
sizeof(size_t);
// Number of bools handled in this iteration (the last one may be partial).
559 constexpr
int __todo =
std::min(
sizeof(
size_t), _Np - __offset);
560 if constexpr (__todo == 1)
561 __mem[__offset] = __x[__offset];
566 _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
567 0x0101010101010101ULL);
570 __x.template _M_extract<__offset>()._M_to_bits(),
573 _S_store<__todo>(__bools, __mem + __offset);
577 else if constexpr (__have_sse2 && _Np > 7)
578 __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) {
579 constexpr
int __offset = __i * 16;
580 constexpr
int __todo =
std::min(16,
int(_Np) - __offset);
581 const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
582 __vector_type16_t<_UChar> __bools;
583 if constexpr (__have_avx512f)
586 = _mm512_maskz_mov_epi32(__bits, __to_intrin(
587 __vector_broadcast<16>(1)));
589 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
590 __todo > 8 ? __hi256(__as32bits)
592 __bools = __vector_bitcast<_UChar>(
593 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
// Non-AVX512F path: replicate the 16 mask bits across the vector with three
// unpack steps, isolate one bit per byte, then map zero -> 0 and
// non-zero -> 1 ((__tmp2 == 0) yields all-ones (-1) or 0; adding 1 gives
// 0 or 1).
597 using _V = __vector_type_t<_UChar, 16>;
598 auto __tmp = _mm_cvtsi32_si128(__bits);
599 __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
600 __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
601 __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
602 _V __tmp2 =
reinterpret_cast<_V
>(__tmp);
603 __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
604 1, 2, 4, 8, 16, 32, 64, 128};
605 __bools = (__tmp2 == 0) + 1;
607 _S_store<__todo>(__bools, __mem + __offset);
610 _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
// _S_blend_avx512(__k, __a, __b): blends the vectors __a and __b under the
// bitmask __k using the __builtin_ia32_blendm{b,w,d,q,ps,pd}_{128,256,512}
// _mask builtins. The builtin is selected by vector size (16/32/64 bytes),
// element size (1/2/4/8 bytes) and, for 4- and 8-byte elements, whether the
// element type is floating-point. Integer variants operate on a bit-cast
// view (__aa / __bb) and cast the result back to _TV.
// Requirements (static_asserts): _TV is a vector type of at least 16 bytes
// with elements of at most 8 bytes.
//
// NOTE(review): the guard around the early `return __k ? __a : __b;` and the
// first line of the _IntT alias are missing from this extract (embedded
// numbering gaps at 622-624, 626, 631, 633). Presumably the early return
// is a compiler-specific alternative -- confirm against upstream.
619 template <
typename _Kp,
typename _TV>
620 _GLIBCXX_SIMD_INTRINSIC
static _TV
621 _S_blend_avx512(
const _Kp __k,
const _TV __a,
const _TV __b) noexcept
625 return __k ? __a : __b;
627 static_assert(__is_vector_type_v<_TV>);
628 using _Tp =
typename _VectorTraits<_TV>::value_type;
629 static_assert(
sizeof(_TV) >= 16);
630 static_assert(
sizeof(_Tp) <= 8);
632 = conditional_t<(sizeof(_Tp) > 2),
634 conditional_t<sizeof(_Tp) == 1, char, short>>;
635 [[maybe_unused]]
const auto __aa = __vector_bitcast<_IntT>(__a);
636 [[maybe_unused]]
const auto __bb = __vector_bitcast<_IntT>(__b);
// --- 512-bit vectors ---
637 if constexpr (
sizeof(_TV) == 64)
639 if constexpr (
sizeof(_Tp) == 1)
640 return reinterpret_cast<_TV>(
641 __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
642 else if constexpr (sizeof(_Tp) == 2)
643 return reinterpret_cast<_TV>(
644 __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
645 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
646 return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
647 else if constexpr (sizeof(_Tp) == 4)
648 return reinterpret_cast<_TV>(
649 __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
650 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
651 return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
652 else if constexpr (sizeof(_Tp) == 8)
653 return reinterpret_cast<_TV>(
654 __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
// --- 256-bit vectors ---
656 else if constexpr (sizeof(_TV) == 32)
658 if constexpr (
sizeof(_Tp) == 1)
659 return reinterpret_cast<_TV>(
660 __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
661 else if constexpr (sizeof(_Tp) == 2)
662 return reinterpret_cast<_TV>(
663 __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
664 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
665 return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
666 else if constexpr (sizeof(_Tp) == 4)
667 return reinterpret_cast<_TV>(
668 __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
669 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
670 return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
671 else if constexpr (sizeof(_Tp) == 8)
672 return reinterpret_cast<_TV>(
673 __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
// --- 128-bit vectors ---
675 else if constexpr (sizeof(_TV) == 16)
677 if constexpr (
sizeof(_Tp) == 1)
678 return reinterpret_cast<_TV>(
679 __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
680 else if constexpr (sizeof(_Tp) == 2)
681 return reinterpret_cast<_TV>(
682 __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
683 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
684 return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
685 else if constexpr (sizeof(_Tp) == 4)
686 return reinterpret_cast<_TV>(
687 __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
688 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
689 return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
690 else if constexpr (sizeof(_Tp) == 8)
691 return reinterpret_cast<_TV>(
692 __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
// _S_blend_intrin(__k, __a, __b): blend of two intrinsic vectors under a
// vector mask, implemented with the blendv builtins. _Tp must already be an
// intrinsic type (__to_intrin(__a) has to return _Tp unchanged).
// A set of operator() overloads, one per intrinsic type, forwards to the
// matching builtin; the result is obtained by calling __eval(__a, __b, __k).
// For __m256i the byte-wise pblendvb256 is used when AVX2 is available,
// otherwise the operands are reinterpreted as 8 x float and blendvps256 is
// used.
//
// NOTE(review): the last parameter line and the declaration of the helper
// object holding the overloads (named __eval at the call site) are missing
// from this extract.
703 template <
typename _Tp>
704 _GLIBCXX_SIMD_INTRINSIC
static _Tp _S_blend_intrin(_Tp __k, _Tp __a,
707 static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
710 _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
711 __m128 __k)
const noexcept
713 return __builtin_ia32_blendvps(__a, __b, __k);
715 _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
716 __m128d __k)
const noexcept
718 return __builtin_ia32_blendvpd(__a, __b, __k);
720 _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
721 __m128i __k)
const noexcept
723 return reinterpret_cast<__m128i
>(
724 __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
725 reinterpret_cast<__v16qi>(__b),
726 reinterpret_cast<__v16qi>(__k)));
728 _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
729 __m256 __k)
const noexcept
731 return __builtin_ia32_blendvps256(__a, __b, __k);
733 _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
734 __m256d __k)
const noexcept
736 return __builtin_ia32_blendvpd256(__a, __b, __k);
738 _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
739 __m256i __k)
const noexcept
741 if constexpr (__have_avx2)
742 return reinterpret_cast<__m256i
>(
743 __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
744 reinterpret_cast<__v32qi>(__b),
745 reinterpret_cast<__v32qi>(__k)));
// Without AVX2 there is no 256-bit byte blend; fall back to the float blend.
747 return reinterpret_cast<__m256i
>(
748 __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
749 reinterpret_cast<__v8sf>(__b),
750 reinterpret_cast<__v8sf>(__k)));
753 return __eval(__a, __b, __k);
// _S_blend for bitmask (_SimdWrapper<bool, _Np>) masks; requires AVX512F.
// Per element, a set mask bit selects __at1 and a clear bit selects __at0
// (see the element-wise expression in the constant-propagation path).
//  - If all three inputs are known constants, the result is generated
//    element by element so the compiler can fold it.
//  - If the vector is 64 bytes, or at least 16 bytes with AVX512VL, the call
//    goes straight to _S_blend_avx512.
//  - Otherwise the operands are bit-cast to a vector of __size elements
//    (16 bytes with AVX512VL, else 64 bytes), blended with _S_blend_avx512,
//    and cast back to _Np elements.
760 template <
typename _Tp,
size_t _Np>
761 _GLIBCXX_SIMD_INTRINSIC
static constexpr _SimdWrapper<_Tp, _Np>
762 _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
763 _SimdWrapper<_Tp, _Np> __at1)
// Written so the assertion depends on _Tp and only fires on instantiation.
765 static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
766 if (__k._M_is_constprop() && __at0._M_is_constprop()
767 && __at1._M_is_constprop())
768 return __generate_from_n_evaluations<_Np,
769 __vector_type_t<_Tp, _Np>>([&](
770 auto __i) constexpr {
return __k[__i] ? __at1[__i] : __at0[__i]; });
771 else if constexpr (
sizeof(__at0) == 64
772 || (__have_avx512vl &&
sizeof(__at0) >= 16))
773 return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
776 static_assert((__have_avx512vl &&
sizeof(__at0) < 16)
777 || !__have_avx512vl);
778 constexpr
size_t __size = (__have_avx512vl ? 16 : 64) /
sizeof(_Tp);
779 return __vector_bitcast<_Tp, _Np>(
780 _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
781 __vector_bitcast<_Tp, __size>(__at1)));
// _S_blend for vector masks (one __int_for_sizeof_t<_Tp> mask element per
// data element). The generic formula, used for constant inputs and as the
// final fallback, is (~__kk & __at0) | (__kk & __at1), i.e. set mask bits
// select __at1.
// Other visible strategies:
//  - with suitable AVX-512 support (AVX512F for 64-byte vectors or AVX512VL;
//    4-byte-or-larger elements or AVX512BW), the mask is converted to a
//    bitmask via _MaskImplX86Mixin::_S_to_bits and the bitmask path is used;
//  - with SSE4.1, _S_blend_intrin is used.
//
// NOTE(review): several lines (the return inside the constprop check, the
// tail of the bitmask call, the tail of the _S_blend_intrin call, braces and
// `else`) are missing from this extract.
785 template <
typename _Tp,
size_t _Np>
786 _GLIBCXX_SIMD_INTRINSIC
static constexpr _SimdWrapper<_Tp, _Np>
787 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
788 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
790 const auto __kk = __wrapper_bitcast<_Tp>(__k);
791 if (__builtin_is_constant_evaluated()
792 || (__kk._M_is_constprop() && __at0._M_is_constprop()
793 && __at1._M_is_constprop()))
795 auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
796 if (__r._M_is_constprop())
799 if constexpr (((__have_avx512f &&
sizeof(__at0) == 64) || __have_avx512vl)
800 && (
sizeof(_Tp) >= 4 || __have_avx512bw))
803 _SimdWrapper<bool, _Np>(
// __make_dependent_t delays lookup of _MaskImplX86Mixin (only
// forward-declared at this point) until instantiation.
804 __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
812 if constexpr (__have_sse4_1)
813 return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
816 return __or(__andnot(__kk, __at0), __and(__kk, __at1));
// _SimdImplX86<_Abi>: x86 implementation layer derived from
// _SimdImplBuiltin<_Abi>. The members below re-export the base class mask
// type and the ABI's size constants, and define the largest store width.
825 template <
typename _Abi>
826 struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
828 using _Base = _SimdImplBuiltin<_Abi>;
// Mask member type for element type _Tp, taken from the base class.
830 template <
typename _Tp>
831 using _MaskMember =
typename _Base::template _MaskMember<_Tp>;
// Element counts as defined by the ABI tag.
833 template <
typename _Tp>
834 static constexpr
size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
836 template <
typename _Tp>
837 static constexpr
size_t _S_size = _Abi::template _S_size<_Tp>;
// Largest store size in bytes: 64 with AVX512BW, or with AVX512F for
// elements of at least 4 bytes; 32 with AVX2, or with AVX for floating-point
// elements.
// NOTE(review): the final alternative of this conditional expression
// (embedded line 843) is missing from this extract.
839 template <
typename _Tp>
840 static constexpr
size_t _S_max_store_size
841 = (
sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64
842 : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32
// Mask implementation associated with the ABI tag.
844 using _MaskImpl =
typename _Abi::_MaskImpl;
// _S_masked_load(__merge, __k, __mem): loads elements from __mem into
// __merge under the mask __k and returns the merged vector.
//
// When _Up is _Tp, or has the same size and the same integral-ness as _Tp,
// no value conversion is needed and a masked-load instruction is chosen:
//  - AVX-512 (ABI is AVX-512 or the needed BW/VL extensions exist):
//    _mm*_mask_loadu_{epi8,epi16,epi32,ps,epi64,pd}, with the mask converted
//    to bits via _MaskImpl::_S_to_bits;
//  - AVX2 / AVX, 4- or 8-byte elements: __maskload_{epi32,ps,epi64,pd}; the
//    loaded lanes are ORed with (~__k & __merge) to keep unselected lanes;
//  - otherwise an element-wise loop over the set mask bits.
// The last visible statement forwards to _Base::_S_masked_load.
//
// NOTE(review): many lines are missing from this extract (parts of
// conditions at embedded 855, 862-863, 878-879; assignment targets at
// 929, 938, 980, 989; the bodies of the element-wise lambda; and a large gap
// between 998 and 1027). The summary above covers only the visible code.
847 template <
typename _Tp,
size_t _Np,
typename _Up>
848 static inline _SimdWrapper<_Tp, _Np>
849 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
850 const _Up* __mem) noexcept
852 static_assert(_Np == _S_size<_Tp>);
853 if constexpr (is_same_v<_Tp, _Up> ||
854 (
sizeof(_Tp) ==
sizeof(_Up)
856 _Tp> == is_integral_v<_Up>)
860 [[maybe_unused]]
const auto __intrin = __to_intrin(__merge);
// --- AVX-512 byte-element masked loads ---
861 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
864 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
865 if constexpr (
sizeof(__intrin) == 16)
866 __merge = __vector_bitcast<_Tp, _Np>(
867 _mm_mask_loadu_epi8(__intrin, __kk, __mem));
868 else if constexpr (sizeof(__merge) == 32)
869 __merge = __vector_bitcast<_Tp, _Np>(
870 _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
871 else if constexpr (sizeof(__merge) == 64)
872 __merge = __vector_bitcast<_Tp, _Np>(
873 _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
875 __assert_unreachable<_Tp>();
// --- AVX-512 16-bit-element masked loads ---
877 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
880 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
881 if constexpr (
sizeof(__intrin) == 16)
882 __merge = __vector_bitcast<_Tp, _Np>(
883 _mm_mask_loadu_epi16(__intrin, __kk, __mem));
884 else if constexpr (sizeof(__intrin) == 32)
885 __merge = __vector_bitcast<_Tp, _Np>(
886 _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
887 else if constexpr (sizeof(__intrin) == 64)
888 __merge = __vector_bitcast<_Tp, _Np>(
889 _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
891 __assert_unreachable<_Tp>();
// --- AVX-512 32-bit integer masked loads ---
893 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
894 && sizeof(_Tp) == 4 && is_integral_v<_Up>)
896 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
897 if constexpr (
sizeof(__intrin) == 16)
898 __merge = __vector_bitcast<_Tp, _Np>(
899 _mm_mask_loadu_epi32(__intrin, __kk, __mem));
900 else if constexpr (sizeof(__intrin) == 32)
901 __merge = __vector_bitcast<_Tp, _Np>(
902 _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
903 else if constexpr (sizeof(__intrin) == 64)
904 __merge = __vector_bitcast<_Tp, _Np>(
905 _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
907 __assert_unreachable<_Tp>();
// --- AVX-512 float masked loads ---
909 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
910 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
912 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
913 if constexpr (
sizeof(__intrin) == 16)
914 __merge = __vector_bitcast<_Tp, _Np>(
915 _mm_mask_loadu_ps(__intrin, __kk, __mem));
916 else if constexpr (sizeof(__intrin) == 32)
917 __merge = __vector_bitcast<_Tp, _Np>(
918 _mm256_mask_loadu_ps(__intrin, __kk, __mem));
919 else if constexpr (sizeof(__intrin) == 64)
920 __merge = __vector_bitcast<_Tp, _Np>(
921 _mm512_mask_loadu_ps(__intrin, __kk, __mem));
923 __assert_unreachable<_Tp>();
// --- AVX2 32-bit integer maskload; unselected lanes kept from __merge ---
925 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
926 && is_integral_v<_Up>)
928 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
930 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
931 __vector_bitcast<_Tp, _Np>(
932 __maskload_epi32(reinterpret_cast<const int*>(__mem),
// --- AVX 4-byte maskload through the float form ---
935 else if constexpr (__have_avx &&
sizeof(_Tp) == 4)
937 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
939 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
940 __vector_bitcast<_Tp, _Np>(
941 __maskload_ps(reinterpret_cast<const float*>(__mem),
// --- AVX-512 64-bit integer masked loads ---
944 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
945 &&
sizeof(_Tp) == 8 && is_integral_v<_Up>)
947 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
948 if constexpr (
sizeof(__intrin) == 16)
949 __merge = __vector_bitcast<_Tp, _Np>(
950 _mm_mask_loadu_epi64(__intrin, __kk, __mem));
951 else if constexpr (sizeof(__intrin) == 32)
952 __merge = __vector_bitcast<_Tp, _Np>(
953 _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
954 else if constexpr (sizeof(__intrin) == 64)
955 __merge = __vector_bitcast<_Tp, _Np>(
956 _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
958 __assert_unreachable<_Tp>();
// --- AVX-512 double masked loads ---
960 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
961 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
963 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
964 if constexpr (
sizeof(__intrin) == 16)
965 __merge = __vector_bitcast<_Tp, _Np>(
966 _mm_mask_loadu_pd(__intrin, __kk, __mem));
967 else if constexpr (sizeof(__intrin) == 32)
968 __merge = __vector_bitcast<_Tp, _Np>(
969 _mm256_mask_loadu_pd(__intrin, __kk, __mem));
970 else if constexpr (sizeof(__intrin) == 64)
971 __merge = __vector_bitcast<_Tp, _Np>(
972 _mm512_mask_loadu_pd(__intrin, __kk, __mem));
974 __assert_unreachable<_Tp>();
// --- AVX2 64-bit integer maskload ---
976 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
977 && is_integral_v<_Up>)
979 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
981 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
982 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
983 reinterpret_cast<const _LLong*>(__mem),
// --- AVX 8-byte maskload through the double form ---
986 else if constexpr (__have_avx &&
sizeof(_Tp) == 8)
988 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
990 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
991 __vector_bitcast<_Tp, _Np>(
992 __maskload_pd(reinterpret_cast<const double*>(__mem),
// --- Element-wise fallback over the set bits of the mask ---
996 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
998 __merge._M_set(__i, static_cast<_Tp>(
// Delegation to the generic implementation (its enclosing condition is not
// visible in this extract).
1027 __merge = _Base::_S_masked_load(__merge, __k, __mem);
// _S_masked_store_nocvt(__v, __mem, __k) -- bitmask overload.
// Stores the elements of __v selected by the bitmask __k to __mem without
// any value conversion. Dispatch is on the size of the intrinsic vector
// (64 / 32 / 16 bytes), the element size, and the available ISA:
//  - a matching _mm*_mask_storeu_{epi8,epi16,epi32,ps,epi64,pd} is used when
//    the required AVX-512 extension is available;
//  - for 32- and 16-byte vectors with AVX512F but without the needed VL
//    support, the vector is widened to 64 bytes (bit-cast) and this function
//    is re-entered with a 64-byte wrapper and the same mask bits;
//  - otherwise the bitmask is converted to a vector mask with
//    _MaskImpl::_S_to_maskvector and the vector-mask overload is called.
//
// NOTE(review): braces / `else` lines are missing from this extract, and the
// `#if 0` line below carries a trailing `//` comment that has swallowed the
// following `else if` line during extraction.
1033 template <
typename _Tp,
size_t _Np>
1034 _GLIBCXX_SIMD_INTRINSIC
static void 1035 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1036 _SimdWrapper<bool, _Np> __k)
1038 [[maybe_unused]]
const auto __vi = __to_intrin(__v);
// --- 64-byte vectors ---
1039 if constexpr (
sizeof(__vi) == 64)
1041 static_assert(
sizeof(__v) == 64 && __have_avx512f);
1042 if constexpr (__have_avx512bw &&
sizeof(_Tp) == 1)
1043 _mm512_mask_storeu_epi8(__mem, __k, __vi);
1044 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
1045 _mm512_mask_storeu_epi16(__mem, __k, __vi);
1046 else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
1048 if constexpr (is_integral_v<_Tp>)
1049 _mm512_mask_storeu_epi32(__mem, __k, __vi);
1051 _mm512_mask_storeu_ps(__mem, __k, __vi);
1053 else if constexpr (__have_avx512f &&
sizeof(_Tp) == 8)
1055 if constexpr (is_integral_v<_Tp>)
1056 _mm512_mask_storeu_epi64(__mem, __k, __vi);
1058 _mm512_mask_storeu_pd(__mem, __k, __vi);
1060 #if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32 1062 else if constexpr (__have_sse2)
1064 using _M = __vector_type_t<_Tp, _Np>;
1065 using _MVT = _VectorTraits<_M>;
1066 _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)),
1067 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
1068 reinterpret_cast<char*>(__mem));
1069 _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)),
1070 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1071 __k._M_data >> 1 * _MVT::_S_full_size)),
1072 reinterpret_cast<char*>(__mem) + 1 * 16);
1073 _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)),
1074 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1075 __k._M_data >> 2 * _MVT::_S_full_size)),
1076 reinterpret_cast<char*>(__mem) + 2 * 16);
1077 if constexpr (_Np > 48 /
sizeof(_Tp))
1078 _mm_maskmoveu_si128(
1079 __auto_bitcast(__extract<3, 4>(__v._M_data)),
1080 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1081 __k._M_data >> 3 * _MVT::_S_full_size)),
1082 reinterpret_cast<char*>(__mem) + 3 * 16);
1086 __assert_unreachable<_Tp>();
// --- 32-byte vectors ---
1088 else if constexpr (
sizeof(__vi) == 32)
1090 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1091 _mm256_mask_storeu_epi8(__mem, __k, __vi);
1092 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1093 _mm256_mask_storeu_epi16(__mem, __k, __vi);
1094 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1096 if constexpr (is_integral_v<_Tp>)
1097 _mm256_mask_storeu_epi32(__mem, __k, __vi);
1099 _mm256_mask_storeu_ps(__mem, __k, __vi);
1101 else if constexpr (__have_avx512vl &&
sizeof(_Tp) == 8)
1103 if constexpr (is_integral_v<_Tp>)
1104 _mm256_mask_storeu_epi64(__mem, __k, __vi);
1106 _mm256_mask_storeu_pd(__mem, __k, __vi);
// No VL support: widen to a 64-byte vector and reuse the 64-byte path.
1108 else if constexpr (__have_avx512f
1109 && (
sizeof(_Tp) >= 4 || __have_avx512bw))
1112 _S_masked_store_nocvt(
1113 _SimdWrapper64<_Tp>(
1114 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
1115 __mem, _SimdWrapper<
bool, 64 /
sizeof(_Tp)>(__k._M_data));
// Last resort: convert the bitmask to a vector mask.
1118 _S_masked_store_nocvt(__v, __mem,
1119 _MaskImpl::template _S_to_maskvector<
1120 __int_for_sizeof_t<_Tp>, _Np>(__k));
// --- 16-byte vectors ---
1122 else if constexpr (
sizeof(__vi) == 16)
1124 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1125 _mm_mask_storeu_epi8(__mem, __k, __vi);
1126 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1127 _mm_mask_storeu_epi16(__mem, __k, __vi);
1128 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1130 if constexpr (is_integral_v<_Tp>)
1131 _mm_mask_storeu_epi32(__mem, __k, __vi);
1133 _mm_mask_storeu_ps(__mem, __k, __vi);
1135 else if constexpr (__have_avx512vl &&
sizeof(_Tp) == 8)
1137 if constexpr (is_integral_v<_Tp>)
1138 _mm_mask_storeu_epi64(__mem, __k, __vi);
1140 _mm_mask_storeu_pd(__mem, __k, __vi);
// No VL support: widen to a 64-byte vector and reuse the 64-byte path.
1142 else if constexpr (__have_avx512f
1143 && (
sizeof(_Tp) >= 4 || __have_avx512bw))
1146 _S_masked_store_nocvt(
1147 _SimdWrapper64<_Tp>(
1148 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
1149 __mem, _SimdWrapper<
bool, 64 /
sizeof(_Tp)>(__k._M_data));
// Last resort: convert the bitmask to a vector mask.
1152 _S_masked_store_nocvt(__v, __mem,
1153 _MaskImpl::template _S_to_maskvector<
1154 __int_for_sizeof_t<_Tp>, _Np>(__k));
1157 __assert_unreachable<_Tp>();
// _S_masked_store_nocvt (vector-mask overload)
//
// Writes elements of __v to __mem under control of the vector mask __k. No
// element conversion takes place: the memory element type is _Tp itself.
//
//   __v   : the values to store (at most 32 bytes are handled here).
//   __mem : destination; only reinterpret_cast to int/float/double/char
//           pointers is applied, no alignment assumption is visible here.
//   __k   : mask vector whose elements are integers of the same size as _Tp.
//
// The instruction is selected at compile time from the vector size, the
// element size and the available ISA extensions (the __have_* constants).
// NOTE(review): this listing lost its brace/else-only lines during
// extraction; the comments below describe the visible statements only.
1160 template <
typename _Tp,
size_t _Np>
1161 _GLIBCXX_SIMD_INTRINSIC
static void 1162 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1163 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
// --- vectors of up to 16 bytes: handled through 128-bit intrinsics ---
1165 if constexpr (
sizeof(__v) <= 16)
// Value and mask are both reinterpreted as __m128i; [[maybe_unused]] because
// not every branch below reads both.
1167 [[maybe_unused]]
const auto __vi
1168 = __intrin_bitcast<__m128i>(__as_vector(__v));
1169 [[maybe_unused]]
const auto __ki
1170 = __intrin_bitcast<__m128i>(__as_vector(__k));
// 1- and 2-byte elements with AVX512BW+VL: turn the vector mask into a
// bitmask (_mm_movepi8_mask / _mm_movepi16_mask) and use the masked store.
1171 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1172 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
1173 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1174 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
// 4-byte elements: AVX2 integer maskstore for integral _Tp, otherwise the
// AVX float maskstore (value bit-cast to float, bits are stored unchanged).
1175 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1176 && is_integral_v<_Tp>)
1177 _mm_maskstore_epi32(reinterpret_cast<
int*>(__mem), __ki, __vi);
1178 else if constexpr (__have_avx && sizeof(_Tp) == 4)
1179 _mm_maskstore_ps(reinterpret_cast<
float*>(__mem), __ki,
1180 __vector_bitcast<
float>(__vi));
// 8-byte elements: same scheme with the 64-bit / double variants.
1181 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1182 && is_integral_v<_Tp>)
1183 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
1184 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1185 _mm_maskstore_pd(reinterpret_cast<
double*>(__mem), __ki,
1186 __vector_bitcast<
double>(__vi));
// Last resort with SSE2 only: byte-granular masked move of the whole
// 128-bit register to __mem.
1187 else if constexpr (__have_sse2)
1188 _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<
char*>(__mem));
// --- 32-byte vectors: same ladder using the 256-bit intrinsics ---
1190 else if constexpr (sizeof(__v) == 32)
1192 [[maybe_unused]]
const auto __vi
1193 = __intrin_bitcast<__m256i>(__as_vector(__v));
1194 [[maybe_unused]]
const auto __ki
1195 = __intrin_bitcast<__m256i>(__as_vector(__k));
1196 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1197 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
1198 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1199 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
1200 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1201 && is_integral_v<_Tp>)
1202 _mm256_maskstore_epi32(reinterpret_cast<
int*>(__mem), __ki, __vi);
// Unlike the 16-byte case there is no __have_avx test on this branch
// (presumably because a 32-byte vector already implies AVX -- confirm).
1203 else if constexpr (sizeof(_Tp) == 4)
1204 _mm256_maskstore_ps(reinterpret_cast<
float*>(__mem), __ki,
1205 __vector_bitcast<
float>(__v));
1206 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1207 && is_integral_v<_Tp>)
1208 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
1210 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1211 _mm256_maskstore_pd(reinterpret_cast<
double*>(__mem), __ki,
1212 __vector_bitcast<
double>(__v));
// SSE2 fallback: store the low and high 128-bit halves separately; the high
// half goes to byte offset 16 of the destination.
1213 else if constexpr (__have_sse2)
1215 _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
1216 reinterpret_cast<char*>(__mem));
1217 _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
1218 reinterpret_cast<char*>(__mem) + 16);
// Any other vector size is not expected in this overload.
1222 __assert_unreachable<_Tp>();
// _S_masked_store
//
// Masked store of __v to __mem where the memory element type _Up may differ
// from the vector element type _Tp.
//
//   __v   : values to store.
//   __mem : destination array of _Up.
//   __k   : mask (_MaskMember<_Tp>) selecting the elements to write.
//
// Fast path: when both types are integral, the store narrows
// (sizeof(_Tp) > sizeof(_Up)) and suitable AVX-512 support exists, a single
// "convert and masked store" intrinsic (_mm*_mask_cvtepiNN_storeu_epiMM) is
// issued. Every other combination is forwarded to _Base::_S_masked_store.
// NOTE(review): brace/else-only lines are missing from this listing.
1227 template <
typename _Tp,
size_t _Np,
typename _Up>
1228 _GLIBCXX_SIMD_INTRINSIC
static void 1229 _S_masked_store(
const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
1230 const _MaskMember<_Tp> __k) noexcept
// Conditions for the fast path:
//  * integral -> strictly narrower integral,
//  * AVX512F, and additionally AVX512BW when _Tp is narrower than 4 bytes,
//  * a 64-byte vector, or AVX512VL for the 16/32-byte forms.
1232 if constexpr (is_integral_v<
1233 _Tp> && is_integral_v<_Up> &&
sizeof(_Tp) >
sizeof(_Up)
1234 && __have_avx512f && (
sizeof(_Tp) >= 4 || __have_avx512bw)
1235 && (
sizeof(__v) == 64 || __have_avx512vl))
// __vi: the value as an intrinsic vector type; __kk: the mask converted to
// a bitmask, as taken by the _mm*_mask_* intrinsics below.
1237 const auto __vi = __to_intrin(__v);
1238 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
// Dispatch on (source element size, destination element size, vector size).
// 8-byte source -> 4-byte destination:
1239 if constexpr (
sizeof(_Tp) == 8 &&
sizeof(_Up) == 4
1240 &&
sizeof(__vi) == 64)
1241 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1242 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1243 && sizeof(__vi) == 32)
1244 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1245 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1246 && sizeof(__vi) == 16)
1247 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
// 8-byte source -> 2-byte destination:
1248 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1249 && sizeof(__vi) == 64)
1250 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1251 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1252 && sizeof(__vi) == 32)
1253 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1254 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1255 && sizeof(__vi) == 16)
1256 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
// 8-byte source -> 1-byte destination:
1257 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1258 && sizeof(__vi) == 64)
1259 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1260 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1261 && sizeof(__vi) == 32)
1262 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1263 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1264 && sizeof(__vi) == 16)
1265 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
// 4-byte source -> 2-byte destination:
1266 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1267 && sizeof(__vi) == 64)
1268 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1269 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1270 && sizeof(__vi) == 32)
1271 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1272 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1273 && sizeof(__vi) == 16)
1274 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
// 4-byte source -> 1-byte destination:
1275 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1276 && sizeof(__vi) == 64)
1277 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1278 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1279 && sizeof(__vi) == 32)
1280 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1281 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1282 && sizeof(__vi) == 16)
1283 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
// 2-byte source -> 1-byte destination:
1284 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1285 && sizeof(__vi) == 64)
1286 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1287 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1288 && sizeof(__vi) == 32)
1289 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1290 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1291 && sizeof(__vi) == 16)
1292 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
// No further size combination satisfies the fast-path condition.
1294 __assert_unreachable<_Tp>();
// Everything outside the fast path: generic implementation in the base.
1297 _Base::_S_masked_store(__v, __mem, __k);
// _S_multiplies
//
// Element-wise product of __x and __y. Returns a _V.
//
// Only vectors with 1-byte elements get special treatment here; they are
// multiplied through wider integer arithmetic with each result byte masked
// or blended back into its lane. All other element sizes are forwarded to
// _Base::_S_multiplies.
// NOTE(review): brace/else-only lines and a few continuation lines (e.g. the
// declaration of __odd) are missing from this listing.
1302 template <typename _V, typename _VVT = _VectorTraits<_V>>
1303 _GLIBCXX_SIMD_INTRINSIC static constexpr _V _S_multiplies(_V __x, _V __y)
1305 using _Tp =
typename _VVT::value_type;
// During constant evaluation, or when an operand is a propagated constant,
// use the plain vector operator*.
1306 if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
1307 || __y._M_is_constprop())
1308 return __as_vector(__x) * __as_vector(__y);
1309 else if constexpr (
sizeof(_Tp) == 1)
// Two bytes: treat both operands as one 16-bit scalar each.
//   byte 0: low 8 bits of __xs * __ys
//   byte 1: (__xs >> 8) * (__ys & 0xff00) is a multiple of 256 whose bits
//           8..15 are the low byte of (byte 1 of x) * (byte 1 of y); the
//           conversion to short drops everything above bit 15.
1311 if constexpr (
sizeof(_V) == 2)
1313 const auto __xs =
reinterpret_cast<short>(__x._M_data);
1314 const auto __ys =
reinterpret_cast<short>(__y._M_data);
1315 return reinterpret_cast<__vector_type_t<_Tp, 2>
>(short(
1316 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
// Three bytes held in a 4-byte object: same trick on a 32-bit scalar, one
// shifted multiply per byte lane; the middle lane is masked with 0xff00,
// the last term is left unmasked.
1318 else if constexpr (
sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
1320 const auto __xi =
reinterpret_cast<int>(__x._M_data);
1321 const auto __yi =
reinterpret_cast<int>(__y._M_data);
1322 return reinterpret_cast<__vector_type_t<_Tp, 3>
>(
1323 ((__xi * __yi) & 0xff)
1324 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1325 | ((__xi >> 16) * (__yi & 0xff0000)));
// Four bytes: as above with a fourth lane; lanes 0..2 are masked, the top
// lane needs no mask because higher bits fall off the 32-bit result.
1327 else if constexpr (
sizeof(_V) == 4)
1329 const auto __xi =
reinterpret_cast<int>(__x._M_data);
1330 const auto __yi =
reinterpret_cast<int>(__y._M_data);
1331 return reinterpret_cast<__vector_type_t<_Tp, 4>
>(
1332 ((__xi * __yi) & 0xff)
1333 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1334 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
1335 | ((__xi >> 24) * (__yi & 0xff000000u)));
// Eight bytes: widen each byte to 16 bits (sign-extending for signed _Tp,
// zero-extending for unsigned), multiply as shorts, convert back.
1337 else if constexpr (
sizeof(_V) == 8 && __have_avx2
1338 && is_signed_v<_Tp>)
1339 return __convert<typename _VVT::type>(
1340 __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
1341 * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
1342 else if constexpr (
sizeof(_V) == 8 && __have_avx2
1343 && is_unsigned_v<_Tp>)
1344 return __convert<typename _VVT::type>(
1345 __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
1346 * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
// General case: view the byte vector as _Np 16-bit lanes and compute the
// even and odd byte products with two 16-bit multiplies.
1350 constexpr
size_t __full_size = _VVT::_S_full_size;
1351 constexpr
int _Np =
sizeof(_V) >= 16 ? __full_size / 2 : 8;
1352 using _ShortW = _SimdWrapper<short, _Np>;
// __even: the low byte of each 16-bit product is the even-byte result.
1353 const _ShortW __even = __vector_bitcast<short, _Np>(__x)
1354 * __vector_bitcast<short, _Np>(__y);
// __high_byte: every lane is (value-initialized lane) - 256, i.e. 0xff00
// assuming _ShortW() yields zero lanes -- TODO confirm.
1355 _ShortW __high_byte = _ShortW()._M_data - 256;
// Odd-byte products (the declaration line is missing here; the value is used
// as __odd below): (x >> 8) * (y & 0xff00) leaves the product in the high
// byte of each lane and zero in the low byte.
1358 = (__vector_bitcast<short, _Np>(__x) >> 8)
1359 * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
// Combine: the 0xaaaa... constant has every odd bit set, i.e. it addresses
// the odd byte lanes (presumably those are taken from __odd -- confirm
// against _S_blend_avx512).
1360 if constexpr (__have_avx512bw &&
sizeof(_V) > 2)
1361 return _CommonImplX86::_S_blend_avx512(
1362 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
1363 __vector_bitcast<_Tp>(__odd));
1364 else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
1365 return _CommonImplX86::_S_blend_intrin(__to_intrin(
1367 __to_intrin(__even),
1368 __to_intrin(__odd));
// Without a blend instruction: clear the high bytes of __even and OR in
// __odd, whose low bytes are zero.
1371 __or(__andnot(__high_byte, __even), __odd));
// Element types wider than one byte: generic implementation in the base.
1375 return _Base::_S_multiplies(__x, __y);
// _S_divides (compiled only when _GLIBCXX_SIMD_WORKAROUND_PR90993 is defined)
//
// Element-wise quotient __x / __y. Returns a _SimdWrapper<_Tp, _Np>.
//
// For integral element types of at most 4 bytes, when neither constant
// evaluation applies nor __y is a compile-time constant, the division is
// carried out in floating point: operands are converted to float vectors
// (double for 4-byte integers), divided, and the quotients are converted back
// to the integer vector type. All other cases end in _Base::_S_divides.
//
// On non-clang compilers with __GCC_IEC_559 == 0 the quotient is produced via
// inline asm (vdivpd/vdivps with AVX, divpd/divps otherwise) instead of the
// `/` operator (presumably so the compiler cannot substitute a less exact
// sequence -- confirm against the upstream comments).
//
// NOTE(review): from the preprocessor line inside the lambda onwards this
// listing is badly garbled (fused lines, split string literals, an
// unterminated comment); that tail, which also contains _S_modulus and the
// start of the _S_bit_shift_left notes, is reproduced verbatim.
1380 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993 1381 template <
typename _Tp,
size_t _Np>
1382 _GLIBCXX_SIMD_INTRINSIC
static constexpr _SimdWrapper<_Tp, _Np>
1383 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
// Take the floating-point route only at run time with a non-constant divisor.
1385 if (!__builtin_is_constant_evaluated()
1386 && !__builtin_constant_p(__y._M_data))
1387 if constexpr (is_integral_v<_Tp> &&
sizeof(_Tp) <= 4)
// Intermediate type: double for 4-byte integers, float for 1- and 2-byte
// integers. (float represents every 8/16-bit integer exactly, double every
// 32-bit integer.)
1406 using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
// __n_intermediate: number of elements per intermediate floating-point
// vector, capped at _Np; __n_floatv: how many such vectors cover _Np.
1407 constexpr
size_t __n_intermediate
1408 =
std::min(_Np, (__have_avx512f ? 64
1412 using _FloatV = __vector_type_t<_Float, __n_intermediate>;
1413 constexpr
size_t __n_floatv
1414 = __div_roundup(_Np, __n_intermediate);
1415 using _R = __vector_type_t<_Tp, _Np>;
// Convert both operands; the divisor first goes through
// _Abi::__make_padding_nonzero (presumably so unused padding lanes do not
// divide by zero -- confirm).
1416 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
1417 const auto __yf = __convert_all<_FloatV, __n_floatv>(
1418 _Abi::__make_padding_nonzero(__as_vector(__y)));
// Evaluate the per-chunk division __n_floatv times and hand all quotients to
// the first lambda, which converts them back into one _R.
1419 return __call_with_n_evaluations<__n_floatv>(
1420 [](
auto... __quotients) {
1421 return __vector_convert<_R>(__quotients...);
1424 &__yf](
auto __i) -> _SimdWrapper<_Float, __n_intermediate> {
1425 #if !defined __clang__ && __GCC_IEC_559 == 0 1431 if constexpr (__have_avx)
1435 if constexpr (
sizeof(_Tp) == 4)
1436 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}
" 1438 : "x
"(__xf[__i]), "x
"(__yf[__i])); 1440 asm("vdivps\t{%2, %1, %0|%0, %1, %2}
" 1442 : "x
"(__xf[__i]), "x
"(__yf[__i])); 1447 if constexpr (sizeof(_Tp) == 4) 1448 asm("divpd\t{%1, %0|%0, %1}
" 1452 asm("divps\t{%1, %0|%0, %1}
" 1458 return __xf[__i] / __yf[__i]; 1462 /* 64-bit int division is potentially optimizable via double division if 1463 * the value in __x is small enough and the conversion between 1464 * int<->double is efficient enough: 1465 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> && 1468 if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1470 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull, 1471 0xffe0'0000'0000'0000ull})) 1473 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data) 1478 return _Base::_S_divides(__x, __y); 1480 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993 1484 template <typename _Tp, size_t _Np> 1485 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1486 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1488 if (__builtin_is_constant_evaluated() 1489 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8) 1490 return _Base::_S_modulus(__x, __y); 1492 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y))); 1496 // _S_bit_shift_left {{{ 1497 // Notes on UB. C++2a [expr.shift] says: 1498 // -1- [...] The operands shall be of integral or unscoped enumeration type 1499 // and integral promotions are performed. The type of the result is that 1500 // of the promoted left operand. The behavior is undefined if the right 1501 // operand is negative, or greater than or equal to the width of the 1502 // promoted left operand. 1503 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo 1504 // 2^N, where N is the width of the type of the result. 1506 // C++17 [expr.shift] says: 1507 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated 1508 // bits are zero-filled. If E1 has an unsigned type, the value of the 1509 // result is E1 × 2^E2 , reduced modulo one more than the maximum value 1510 // representable in the result type. 
Otherwise, if E1 has a signed type 1511 // and non-negative value, and E1 × 2^E2 is representable in the 1512 // corresponding unsigned type of the result type, then that value, 1513 // converted to the result type, is the resulting value; otherwise, the 1514 // behavior is undefined. 1517 // With C++2a signed and unsigned types have the same UB 1519 // - left shift is not UB for 0 <= RHS < max(32, #bits(T)) 1521 // With C++17 there's little room for optimizations because the standard 1522 // requires all shifts to happen on promoted integrals (i.e. int). Thus, 1523 // short and char shifts must assume shifts affect bits of neighboring 1525 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1526 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1527 inline _GLIBCXX_CONST static typename _TVT::type 1528 _S_bit_shift_left(_Tp __xx, int __y) 1530 using _V = typename _TVT::type; 1531 using _Up = typename _TVT::value_type; 1533 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1534 if (__builtin_is_constant_evaluated()) 1536 #if __cplusplus > 201703 1537 // after C++17, signed shifts have no UB, and behave just like unsigned 1539 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) 1540 return __vector_bitcast<_Up>( 1541 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1544 else if constexpr (sizeof(_Up) == 1) 1546 // (cf. 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894) 1547 if (__builtin_constant_p(__y)) 1558 else if (__y > 2 && __y < 8) 1560 if constexpr (sizeof(__x) > sizeof(unsigned)) 1562 const _UChar __mask = 0xff << __y; // precomputed vector 1563 return __vector_bitcast<_Up>( 1564 __vector_bitcast<_UChar>( 1565 __vector_bitcast<unsigned>(__x) << __y) 1570 const unsigned __mask 1571 = (0xff & (0xff << __y)) * 0x01010101u; 1572 return reinterpret_cast<_V>( 1573 static_cast<__int_for_sizeof_t<_V>>( 1575 reinterpret_cast<__int_for_sizeof_t<_V>>(__x) 1580 else if (__y >= 8 && __y < 32) 1583 __builtin_unreachable(); 1585 // general strategy in the following: use an sllv instead of sll 1586 // instruction, because it's 2 to 4 times faster: 1587 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16) 1588 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8( 1589 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix), 1590 _mm256_set1_epi16(__y)))); 1591 else if constexpr (__have_avx512bw && sizeof(__x) == 32) 1592 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1593 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix), 1594 _mm512_set1_epi16(__y)))); 1595 else if constexpr (__have_avx512bw && sizeof(__x) == 64) 1597 const auto __shift = _mm512_set1_epi16(__y); 1598 return __vector_bitcast<_Up>( 1599 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1600 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)), 1601 _mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1602 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift)))); 1604 else if constexpr (__have_avx2 && sizeof(__x) == 32) 1607 const auto __shift = _mm_cvtsi32_si128(__y); 1609 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift); 1610 __k |= _mm256_srli_epi16(__k, 8); 1611 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift) 1614 const _Up __k = 0xff << __y; 1615 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y) 1621 const auto __shift = _mm_cvtsi32_si128(__y); 1623 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift); 1624 __k 
|= _mm_srli_epi16(__k, 8); 1625 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k); 1631 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1632 inline _GLIBCXX_CONST static typename _TVT::type 1633 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y) 1635 using _V = typename _TVT::type; 1636 using _Up = typename _TVT::value_type; 1638 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1639 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1640 if (__builtin_is_constant_evaluated()) 1642 #if __cplusplus > 201703 1643 // after C++17, signed shifts have no UB, and behave just like unsigned 1645 else if constexpr (is_signed_v<_Up>) 1646 return __vector_bitcast<_Up>( 1647 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1648 __vector_bitcast<make_unsigned_t<_Up>>(__y))); 1650 else if constexpr (sizeof(_Up) == 1) 1652 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1653 return __vector_bitcast<_Up>(__concat( 1654 _mm512_cvtepi16_epi8( 1655 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)), 1656 _mm512_cvtepu8_epi16(__lo256(__iy)))), 1657 _mm512_cvtepi16_epi8( 1658 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)), 1659 _mm512_cvtepu8_epi16(__hi256(__iy)))))); 1660 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1661 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1662 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix), 1663 _mm512_cvtepu8_epi16(__iy)))); 1664 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl) 1665 return __intrin_bitcast<_V>( 1666 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix), 1667 _mm_cvtepu8_epi16(__iy)))); 1668 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1669 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1670 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix), 1671 _mm256_cvtepu8_epi16(__iy)))); 1672 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1673 return __intrin_bitcast<_V>( 1674 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1675 
_mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)), 1676 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy)))))); 1677 else if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1680 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5); 1682 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1684 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1685 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4))); 1688 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1690 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1691 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2))); 1693 auto __x1 = __x + __x; 1694 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1695 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1))); 1697 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1699 else if constexpr (sizeof(__x) == 16) 1702 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5); 1704 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1706 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x; 1709 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1711 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x; 1713 auto __x1 = __x + __x; 1714 __x = __vector_bitcast<_SChar>(__mask) < 0 ? 
__x1 : __x; 1716 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1721 else if constexpr (sizeof(_Up) == 2) 1723 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1724 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy)); 1725 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl) 1726 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy)); 1727 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1728 return __vector_bitcast<_Up>( 1729 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix), 1730 _mm512_castsi256_si512(__iy)))); 1731 else if constexpr (sizeof __ix == 32 && __have_avx2) 1733 const auto __ux = __vector_bitcast<unsigned>(__x); 1734 const auto __uy = __vector_bitcast<unsigned>(__y); 1735 return __vector_bitcast<_Up>(_mm256_blend_epi16( 1736 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1737 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1739 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1740 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy)); 1741 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1742 return __intrin_bitcast<_V>( 1743 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix), 1744 _mm512_castsi128_si512(__iy)))); 1745 else if constexpr (sizeof __ix == 16 && __have_avx2) 1747 const auto __ux = __vector_bitcast<unsigned>(__ix); 1748 const auto __uy = __vector_bitcast<unsigned>(__iy); 1749 return __intrin_bitcast<_V>(_mm_blend_epi16( 1750 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1751 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1753 else if constexpr (sizeof __ix == 16) 1755 using _Float4 = __vector_type_t<float, 4>; 1756 using _Int4 = __vector_type_t<int, 4>; 1757 using _UInt4 = __vector_type_t<unsigned, 4>; 1759 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3))); 1761 * __intrin_bitcast<_V>( 1762 __vector_convert<_Int4>(_SimdWrapper<float, 4>( 1763 reinterpret_cast<_Float4>(__yu << 23))) 1764 | (__vector_convert<_Int4>(_SimdWrapper<float, 
4>( 1765 reinterpret_cast<_Float4>((__yu >> 16) << 23))) 1769 __assert_unreachable<_Tp>(); 1771 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16 1773 // latency is suboptimal, but throughput is at full speedup 1774 return __intrin_bitcast<_V>( 1775 __vector_bitcast<unsigned>(__ix) 1776 * __vector_convert<__vector_type16_t<int>>( 1777 _SimdWrapper<float, 4>(__vector_bitcast<float>( 1778 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000)))); 1779 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16 1782 const auto __lo = _mm_sll_epi64(__ix, __iy); 1784 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy)); 1785 if constexpr (__have_sse4_1) 1786 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0)); 1788 return __vector_bitcast<_Up>( 1789 _mm_move_sd(__vector_bitcast<double>(__hi), 1790 __vector_bitcast<double>(__lo))); 1795 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 1798 // _S_bit_shift_right {{{ 1799 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1800 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1801 inline _GLIBCXX_CONST static typename _TVT::type 1802 _S_bit_shift_right(_Tp __xx, int __y) 1804 using _V = typename _TVT::type; 1805 using _Up = typename _TVT::value_type; 1807 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1808 if (__builtin_is_constant_evaluated()) 1810 else if (__builtin_constant_p(__y) 1812 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__)) 1814 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{ 1815 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y) 1818 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{ 1819 return __intrin_bitcast<_V>( 1820 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix) 1823 | (__vector_bitcast<_UShort>( 1824 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8) 1828 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected 1829 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{ 1832 return 
(__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32) 1833 & _Up(0xffff'ffff'0000'0000ull)) 1834 | __vector_bitcast<_Up>( 1835 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix) 1839 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix) 1841 | __vector_bitcast<_Up>( 1842 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll) 1850 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1851 inline _GLIBCXX_CONST static typename _TVT::type 1852 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y) 1854 using _V = typename _TVT::type; 1855 using _Up = typename _TVT::value_type; 1857 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1858 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1859 if (__builtin_is_constant_evaluated() 1860 || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) 1862 else if constexpr (sizeof(_Up) == 1) //{{{ 1864 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl) 1865 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8( 1866 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix), 1867 _mm_cvtepi8_epi16(__iy)) 1868 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix), 1869 _mm_cvtepu8_epi16(__iy)))); 1870 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl) 1871 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1873 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix), 1874 _mm256_cvtepi8_epi16(__iy)) 1875 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix), 1876 _mm256_cvtepu8_epi16(__iy)))); 1877 else if constexpr (sizeof(__x) == 32 && __have_avx512bw) 1878 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1880 ? 
_mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix), 1881 _mm512_cvtepi8_epi16(__iy)) 1882 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix), 1883 _mm512_cvtepu8_epi16(__iy)))); 1884 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>) 1885 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1886 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1887 0x5555'5555'5555'5555ull, 1889 _mm512_slli_epi16(__ix, 8), 1890 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy, 1891 _mm512_set1_epi16(8))))); 1892 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>) 1893 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1894 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1895 0x5555'5555'5555'5555ull, 1897 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix), 1898 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy)))); 1899 /* This has better throughput but higher latency than the impl below 1900 else if constexpr (__have_avx2 && sizeof(__x) == 16 && 1903 const auto __shorts = __to_intrin(_S_bit_shift_right( 1904 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)), 1905 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy)))); 1906 return __vector_bitcast<_Up>( 1907 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts))); 1910 else if constexpr (__have_avx2 && sizeof(__x) > 8) 1911 // the following uses vpsr[al]vd, which requires AVX2 1912 if constexpr (is_signed_v<_Up>) 1914 const auto r3 = __vector_bitcast<_UInt>( 1915 (__vector_bitcast<int>(__x) 1916 >> (__vector_bitcast<_UInt>(__y) >> 24))) 1919 = __vector_bitcast<_UInt>( 1920 ((__vector_bitcast<int>(__x) << 8) 1921 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))) 1924 = __vector_bitcast<_UInt>( 1925 ((__vector_bitcast<int>(__x) << 16) 1926 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))) 1928 const auto r0 = __vector_bitcast<_UInt>( 1929 (__vector_bitcast<int>(__x) << 24) 1930 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24)); 1931 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1936 const auto r3 = 
(__vector_bitcast<_UInt>(__x) 1937 >> (__vector_bitcast<_UInt>(__y) >> 24)) 1940 = ((__vector_bitcast<_UInt>(__x) << 8) 1941 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)) 1944 = ((__vector_bitcast<_UInt>(__x) << 16) 1945 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)) 1948 = (__vector_bitcast<_UInt>(__x) << 24) 1949 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24); 1950 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1953 else if constexpr (__have_sse4_1 1954 && is_unsigned_v<_Up> && sizeof(__x) > 2) 1956 auto __x128 = __vector_bitcast<_Up>(__ix); 1958 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5); 1959 auto __x4 = __vector_bitcast<_Up>( 1960 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f)); 1961 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1962 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4))); 1964 auto __x2 = __vector_bitcast<_Up>( 1965 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f)); 1966 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1967 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2))); 1969 auto __x1 = __vector_bitcast<_Up>( 1970 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f)); 1971 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1972 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1))); 1973 return __intrin_bitcast<_V>( 1975 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 1976 == 0)); // y > 7 nulls the result 1978 else if constexpr (__have_sse4_1 1979 && is_signed_v<_Up> && sizeof(__x) > 2) 1981 auto __mask = __vector_bitcast<_UChar>( 1982 __vector_bitcast<_UShort>(__iy) << 5); 1983 auto __maskl = [&]() { 1984 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); 1986 auto __xh = __vector_bitcast<short>(__ix); 1987 auto __xl = __vector_bitcast<short>(__ix) << 8; 1988 auto __xh4 = __xh >> 4; 1989 auto __xl4 = __xl >> 4; 1990 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 1991 
__to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4))); 1992 __xl = __vector_bitcast<short>( 1993 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 1994 __to_intrin(__xl4))); 1996 auto __xh2 = __xh >> 2; 1997 auto __xl2 = __xl >> 2; 1998 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 1999 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2))); 2000 __xl = __vector_bitcast<short>( 2001 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2002 __to_intrin(__xl2))); 2004 auto __xh1 = __xh >> 1; 2005 auto __xl1 = __xl >> 1; 2006 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2007 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1))); 2008 __xl = __vector_bitcast<short>( 2009 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2010 __to_intrin(__xl1))); 2011 return __intrin_bitcast<_V>( 2012 (__vector_bitcast<_Up>((__xh & short(0xff00))) 2013 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2015 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2016 == 0)); // y > 7 nulls the result 2018 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2 2021 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5); 2022 auto __x4 = __vector_bitcast<_Up>( 2023 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f)); 2024 __x = __mask > 0x7f ? __x4 : __x; 2026 auto __x2 = __vector_bitcast<_Up>( 2027 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f)); 2028 __x = __mask > 0x7f ? __x2 : __x; 2030 auto __x1 = __vector_bitcast<_Up>( 2031 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f)); 2032 __x = __mask > 0x7f ? 
__x1 : __x; 2034 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2036 else if constexpr (sizeof(__x) > 2) // signed SSE2 2038 static_assert(is_signed_v<_Up>); 2039 auto __maskh = __vector_bitcast<_UShort>(__y) << 5; 2040 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8); 2041 auto __xh = __vector_bitcast<short>(__x); 2042 auto __xl = __vector_bitcast<short>(__x) << 8; 2043 auto __xh4 = __xh >> 4; 2044 auto __xl4 = __xl >> 4; 2045 __xh = __maskh > 0x7fff ? __xh4 : __xh; 2046 __xl = __maskl > 0x7fff ? __xl4 : __xl; 2049 auto __xh2 = __xh >> 2; 2050 auto __xl2 = __xl >> 2; 2051 __xh = __maskh > 0x7fff ? __xh2 : __xh; 2052 __xl = __maskl > 0x7fff ? __xl2 : __xl; 2055 auto __xh1 = __xh >> 1; 2056 auto __xl1 = __xl >> 1; 2057 __xh = __maskh > 0x7fff ? __xh1 : __xh; 2058 __xl = __maskl > 0x7fff ? __xl1 : __xl; 2059 __x = __vector_bitcast<_Up>((__xh & short(0xff00))) 2060 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2063 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2068 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ 2070 [[maybe_unused]] auto __blend_0xaa = [](auto __a, auto __b) { 2071 if constexpr (sizeof(__a) == 16) 2072 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2074 else if constexpr (sizeof(__a) == 32) 2075 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2077 else if constexpr (sizeof(__a) == 64) 2078 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), 2081 __assert_unreachable<decltype(__a)>(); 2083 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) 2084 return __intrin_bitcast<_V>(is_signed_v<_Up> 2085 ? _mm_srav_epi16(__ix, __iy) 2086 : _mm_srlv_epi16(__ix, __iy)); 2087 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32) 2088 return __vector_bitcast<_Up>(is_signed_v<_Up> 2089 ? 
_mm256_srav_epi16(__ix, __iy) 2090 : _mm256_srlv_epi16(__ix, __iy)); 2091 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64) 2092 return __vector_bitcast<_Up>(is_signed_v<_Up> 2093 ? _mm512_srav_epi16(__ix, __iy) 2094 : _mm512_srlv_epi16(__ix, __iy)); 2095 else if constexpr (__have_avx2 && is_signed_v<_Up>) 2096 return __intrin_bitcast<_V>( 2097 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16) 2098 >> (__vector_bitcast<int>(__iy) & 0xffffu)) 2100 __vector_bitcast<int>(__ix) 2101 >> (__vector_bitcast<int>(__iy) >> 16))); 2102 else if constexpr (__have_avx2 && is_unsigned_v<_Up>) 2103 return __intrin_bitcast<_V>( 2104 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu) 2105 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu), 2106 __vector_bitcast<_UInt>(__ix) 2107 >> (__vector_bitcast<_UInt>(__iy) >> 16))); 2108 else if constexpr (__have_sse4_1) 2110 auto __mask = __vector_bitcast<_UShort>(__iy); 2111 auto __x128 = __vector_bitcast<_Up>(__ix); 2113 __mask = (__mask << 3) | (__mask << 11); 2114 // do __x128 = 0 where __y[4] is set 2115 __x128 = __vector_bitcast<_Up>( 2116 _mm_blendv_epi8(__to_intrin(__x128), __m128i(), 2117 __to_intrin(__mask))); 2118 // do __x128 =>> 8 where __y[3] is set 2119 __x128 = __vector_bitcast<_Up>( 2120 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8), 2121 __to_intrin(__mask += __mask))); 2122 // do __x128 =>> 4 where __y[2] is set 2123 __x128 = __vector_bitcast<_Up>( 2124 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4), 2125 __to_intrin(__mask += __mask))); 2126 // do __x128 =>> 2 where __y[1] is set 2127 __x128 = __vector_bitcast<_Up>( 2128 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2), 2129 __to_intrin(__mask += __mask))); 2130 // do __x128 =>> 1 where __y[0] is set 2131 return __intrin_bitcast<_V>( 2132 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1), 2133 __to_intrin(__mask + __mask))); 2137 auto __k = __vector_bitcast<_UShort>(__iy) << 11; 2138 auto __x128 = 
__vector_bitcast<_Up>(__ix); 2139 auto __mask = [](__vector_type16_t<_UShort> __kk) { 2140 return __vector_bitcast<short>(__kk) < 0; 2142 // do __x128 = 0 where __y[4] is set 2143 __x128 = __mask(__k) ? decltype(__x128)() : __x128; 2144 // do __x128 =>> 8 where __y[3] is set 2145 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128; 2146 // do __x128 =>> 4 where __y[2] is set 2147 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128; 2148 // do __x128 =>> 2 where __y[1] is set 2149 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128; 2150 // do __x128 =>> 1 where __y[0] is set 2151 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1 2155 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{ 2157 if constexpr (is_unsigned_v<_Up>) 2159 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31 2160 const __m128 __factor_f = reinterpret_cast<__m128>( 2161 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23)); 2162 const __m128i __factor 2163 = __builtin_constant_p(__factor_f) 2165 __make_vector<unsigned>(__factor_f[0], __factor_f[1], 2166 __factor_f[2], __factor_f[3])) 2167 : _mm_cvttps_epi32(__factor_f); 2169 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31); 2170 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4), 2171 _mm_srli_si128(__factor, 4)); 2172 if constexpr (__have_sse4_1) 2173 return __intrin_bitcast<_V>( 2174 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33)); 2176 return __intrin_bitcast<_V>( 2177 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4)); 2181 auto __shift = [](auto __a, auto __b) { 2182 if constexpr (is_signed_v<_Up>) 2183 return _mm_sra_epi32(__a, __b); 2185 return _mm_srl_epi32(__a, __b); 2188 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i())); 2189 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32)); 2191 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i())); 2192 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12)); 2193 if constexpr (__have_sse4_1) 2194 return __intrin_bitcast<_V>( 2195 
_mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3), 2196 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0)); 2198 return __intrin_bitcast<_V>(_mm_unpacklo_epi64( 2199 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)), 2200 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4)))); 2206 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 2211 template <typename _Tp, size_t _Np> 2212 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2213 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2215 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2217 if (__builtin_is_constant_evaluated() 2218 || (__x._M_is_constprop() && __y._M_is_constprop())) 2219 return _MaskImpl::_S_to_bits( 2220 __as_wrapper<_Np>(__x._M_data == __y._M_data)); 2222 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2223 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2224 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2225 if constexpr (is_floating_point_v<_Tp>) 2227 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2228 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2229 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2230 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2231 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2232 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2233 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2234 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2235 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2236 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2237 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2238 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2240 __assert_unreachable<_Tp>(); 2242 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2243 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2244 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2245 return 
_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2246 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2247 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2248 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2249 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2250 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2251 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2252 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2253 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2254 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2255 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2256 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2257 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2258 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2259 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2260 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2261 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2262 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2263 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2264 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2265 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2267 __assert_unreachable<_Tp>(); 2269 else if (__builtin_is_constant_evaluated()) 2270 return _Base::_S_equal_to(__x, __y); 2271 else if constexpr (sizeof(__x) == 8) // {{{ 2273 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2274 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2275 _MaskMember<_Tp> __r64; 2276 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2280 return _Base::_S_equal_to(__x, __y); 2284 // _S_not_equal_to {{{ 2285 template <typename _Tp, size_t _Np> 2286 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2287 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2289 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2291 if 
(__builtin_is_constant_evaluated() 2292 || (__x._M_is_constprop() && __y._M_is_constprop())) 2293 return _MaskImpl::_S_to_bits( 2294 __as_wrapper<_Np>(__x._M_data != __y._M_data)); 2296 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2297 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2298 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2299 if constexpr (is_floating_point_v<_Tp>) 2301 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2302 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2303 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2304 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2306 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2308 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2309 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2310 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2311 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2312 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2314 __assert_unreachable<_Tp>(); 2316 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2317 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2318 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2319 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2320 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2321 return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2322 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2323 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2324 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2325 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2326 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2327 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2328 else if 
constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2329 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2330 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2331 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2332 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2333 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2334 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2335 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2336 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2337 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2338 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2339 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2341 __assert_unreachable<_Tp>(); 2343 else if constexpr (!__builtin_is_constant_evaluated() // {{{ 2344 && sizeof(__x) == 8) 2346 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2347 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2348 _MaskMember<_Tp> __r64; 2349 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2353 return _Base::_S_not_equal_to(__x, __y); 2358 template <typename _Tp, size_t _Np> 2359 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2360 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2362 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2364 if (__builtin_is_constant_evaluated() 2365 || (__x._M_is_constprop() && __y._M_is_constprop())) 2366 return _MaskImpl::_S_to_bits( 2367 __as_wrapper<_Np>(__x._M_data < __y._M_data)); 2369 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2370 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2371 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2372 if constexpr (sizeof(__xi) == 64) 2374 if constexpr (is_same_v<_Tp, float>) 2375 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2376 else if constexpr (is_same_v<_Tp, double>) 2377 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2378 else if constexpr 
(is_signed_v<_Tp> && sizeof(_Tp) == 1) 2379 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2380 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2381 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2382 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2383 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2384 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2385 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2386 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2387 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2388 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2389 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2390 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2391 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2392 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2393 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2395 __assert_unreachable<_Tp>(); 2397 else if constexpr (sizeof(__xi) == 32) 2399 if constexpr (is_same_v<_Tp, float>) 2400 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2401 else if constexpr (is_same_v<_Tp, double>) 2402 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2403 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2404 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2405 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2406 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2407 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2408 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2409 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2410 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2411 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2412 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2413 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2414 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2415 
else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2416 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2417 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2418 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2420 __assert_unreachable<_Tp>(); 2422 else if constexpr (sizeof(__xi) == 16) 2424 if constexpr (is_same_v<_Tp, float>) 2425 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2426 else if constexpr (is_same_v<_Tp, double>) 2427 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2428 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2429 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2430 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2431 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2432 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2433 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2434 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2435 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2436 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2437 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2438 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2439 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2440 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2441 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2442 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2443 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2445 __assert_unreachable<_Tp>(); 2448 __assert_unreachable<_Tp>(); 2450 else if constexpr (!__builtin_is_constant_evaluated() // {{{ 2451 && sizeof(__x) == 8) 2453 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2454 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2455 _MaskMember<_Tp> __r64; 2456 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2460 return _Base::_S_less(__x, __y); 2464 // _S_less_equal {{{ 2465 template <typename _Tp, size_t _Np> 2466 
_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2467 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2469 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2471 if (__builtin_is_constant_evaluated() 2472 || (__x._M_is_constprop() && __y._M_is_constprop())) 2473 return _MaskImpl::_S_to_bits( 2474 __as_wrapper<_Np>(__x._M_data <= __y._M_data)); 2476 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2477 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2478 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2479 if constexpr (sizeof(__xi) == 64) 2481 if constexpr (is_same_v<_Tp, float>) 2482 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2483 else if constexpr (is_same_v<_Tp, double>) 2484 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2485 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2486 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi); 2487 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2488 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi); 2489 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2490 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi); 2491 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2492 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi); 2493 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2494 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi); 2495 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2496 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi); 2497 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2498 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi); 2499 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2500 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi); 2502 __assert_unreachable<_Tp>(); 2504 else if constexpr (sizeof(__xi) == 32) 2506 if constexpr (is_same_v<_Tp, float>) 2507 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2508 
else if constexpr (is_same_v<_Tp, double>) 2509 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2510 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2511 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi); 2512 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2513 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi); 2514 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2515 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi); 2516 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2517 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi); 2518 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2519 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi); 2520 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2521 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi); 2522 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2523 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi); 2524 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2525 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi); 2527 __assert_unreachable<_Tp>(); 2529 else if constexpr (sizeof(__xi) == 16) 2531 if constexpr (is_same_v<_Tp, float>) 2532 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2533 else if constexpr (is_same_v<_Tp, double>) 2534 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2535 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2536 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi); 2537 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2538 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi); 2539 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2540 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi); 2541 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2542 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi); 2543 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2544 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi); 2545 else if 
constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2546 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi); 2547 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2548 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi); 2549 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2550 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi); 2552 __assert_unreachable<_Tp>(); 2555 __assert_unreachable<_Tp>(); 2557 else if constexpr (!__builtin_is_constant_evaluated() // {{{ 2558 && sizeof(__x) == 8) 2560 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2561 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2562 _MaskMember<_Tp> __r64; 2563 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2567 return _Base::_S_less_equal(__x, __y); 2572 template <typename _Tp, size_t _Np> 2573 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2574 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept 2576 if constexpr (__is_avx512_abi<_Abi>()) 2577 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>()); 2579 return _Base::_S_negate(__x); 2584 using _Base::_S_abs; 2587 template <typename _Tp, size_t _Np> 2588 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2589 _S_sqrt(_SimdWrapper<_Tp, _Np> __x) 2591 if constexpr (__is_sse_ps<_Tp, _Np>()) 2592 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x))); 2593 else if constexpr (__is_sse_pd<_Tp, _Np>()) 2594 return _mm_sqrt_pd(__x); 2595 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2596 return _mm256_sqrt_ps(__x); 2597 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2598 return _mm256_sqrt_pd(__x); 2599 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 2600 return _mm512_sqrt_ps(__x); 2601 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2602 return _mm512_sqrt_pd(__x); 2604 __assert_unreachable<_Tp>(); 2609 template <typename _Tp, size_t _Np> 2610 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2611 _S_ldexp(_SimdWrapper<_Tp, _Np> __x, 2612 __fixed_size_storage_t<int, _Np> __exp) 2614 if constexpr 
(__is_avx512_abi<_Abi>()) 2616 const auto __xi = __to_intrin(__x); 2617 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi> 2619 const auto __expi = __to_intrin(__cvt(__exp)); 2620 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2621 if constexpr (sizeof(__xi) == 16) 2623 if constexpr (sizeof(_Tp) == 8) 2624 return _mm_maskz_scalef_pd(__k1, __xi, __expi); 2626 return _mm_maskz_scalef_ps(__k1, __xi, __expi); 2628 else if constexpr (sizeof(__xi) == 32) 2630 if constexpr (sizeof(_Tp) == 8) 2631 return _mm256_maskz_scalef_pd(__k1, __xi, __expi); 2633 return _mm256_maskz_scalef_ps(__k1, __xi, __expi); 2637 static_assert(sizeof(__xi) == 64); 2638 if constexpr (sizeof(_Tp) == 8) 2639 return _mm512_maskz_scalef_pd(__k1, __xi, __expi); 2641 return _mm512_maskz_scalef_ps(__k1, __xi, __expi); 2645 return _Base::_S_ldexp(__x, __exp); 2650 template <typename _Tp, size_t _Np> 2651 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2652 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2654 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2655 return _mm512_roundscale_ps(__x, 0x0b); 2656 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2657 return _mm512_roundscale_pd(__x, 0x0b); 2658 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2659 return _mm256_round_ps(__x, 0x3); 2660 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2661 return _mm256_round_pd(__x, 0x3); 2662 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2663 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x3)); 2664 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2665 return _mm_round_pd(__x, 0x3); 2666 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2669 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))); 2670 const auto __no_fractional_values 2671 = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x)) 2673 < 0x4b000000; // the exponent is so large that no mantissa bits 2674 // signify fractional values (0x3f8 + 23*8 = 2676 return __no_fractional_values ? 
__truncated : __to_intrin(__x); 2679 return _Base::_S_trunc(__x); 2684 template <typename _Tp, size_t _Np> 2685 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2686 _S_round(_SimdWrapper<_Tp, _Np> __x) 2688 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away 2689 // from zero as required by std::round. Therefore this function is more 2691 using _V = __vector_type_t<_Tp, _Np>; 2693 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2694 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b); 2695 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2696 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b); 2697 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2698 __truncated = _mm256_round_ps(__x._M_data, 2699 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2700 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2701 __truncated = _mm256_round_pd(__x._M_data, 2702 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2703 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2704 __truncated = __auto_bitcast( 2705 _mm_round_ps(__to_intrin(__x), 2706 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); 2707 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2709 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2710 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2711 __truncated = __auto_bitcast( 2712 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)))); 2714 return _Base::_S_round(__x); 2716 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0 2717 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0 2721 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5) 2722 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1) 2724 if constexpr (__have_sse4_1) 2726 else // adjust for missing range in cvttps_epi32 2727 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? 
__rounded 2733 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2734 _GLIBCXX_SIMD_INTRINSIC static _Tp _S_nearbyint(_Tp __x) noexcept 2736 if constexpr (_TVT::template _S_is<float, 16>) 2737 return _mm512_roundscale_ps(__x, 0x0c); 2738 else if constexpr (_TVT::template _S_is<double, 8>) 2739 return _mm512_roundscale_pd(__x, 0x0c); 2740 else if constexpr (_TVT::template _S_is<float, 8>) 2741 return _mm256_round_ps(__x, 2742 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2743 else if constexpr (_TVT::template _S_is<double, 4>) 2744 return _mm256_round_pd(__x, 2745 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2746 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2747 return _mm_round_ps(__x, 2748 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2749 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2750 return _mm_round_pd(__x, 2751 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2753 return _Base::_S_nearbyint(__x); 2758 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2759 _GLIBCXX_SIMD_INTRINSIC static _Tp _S_rint(_Tp __x) noexcept 2761 if constexpr (_TVT::template _S_is<float, 16>) 2762 return _mm512_roundscale_ps(__x, 0x04); 2763 else if constexpr (_TVT::template _S_is<double, 8>) 2764 return _mm512_roundscale_pd(__x, 0x04); 2765 else if constexpr (_TVT::template _S_is<float, 8>) 2766 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2767 else if constexpr (_TVT::template _S_is<double, 4>) 2768 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2769 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2770 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2771 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2772 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2774 return _Base::_S_rint(__x); 2779 template <typename _Tp, size_t _Np> 2780 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2781 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2783 if constexpr 
(__is_avx512_ps<_Tp, _Np>()) 2784 return _mm512_roundscale_ps(__x, 0x09); 2785 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2786 return _mm512_roundscale_pd(__x, 0x09); 2787 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2788 return _mm256_round_ps(__x, 0x1); 2789 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2790 return _mm256_round_pd(__x, 0x1); 2791 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2792 return __auto_bitcast(_mm_floor_ps(__to_intrin(__x))); 2793 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2794 return _mm_floor_pd(__x); 2796 return _Base::_S_floor(__x); 2801 template <typename _Tp, size_t _Np> 2802 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2803 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2805 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2806 return _mm512_roundscale_ps(__x, 0x0a); 2807 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2808 return _mm512_roundscale_pd(__x, 0x0a); 2809 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2810 return _mm256_round_ps(__x, 0x2); 2811 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2812 return _mm256_round_pd(__x, 0x2); 2813 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2814 return __auto_bitcast(_mm_ceil_ps(__to_intrin(__x))); 2815 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2816 return _mm_ceil_pd(__x); 2818 return _Base::_S_ceil(__x); 2823 template <typename _Tp, size_t _Np> 2824 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2825 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2827 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 2829 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4) 2830 return _mm512_movepi32_mask( 2831 __intrin_bitcast<__m512i>(__x._M_data)); 2832 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8) 2833 return _mm512_movepi64_mask( 2834 __intrin_bitcast<__m512i>(__x._M_data)); 2835 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4) 2836 return _mm256_movepi32_mask( 2837 __intrin_bitcast<__m256i>(__x._M_data)); 2838 else 
if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8) 2839 return _mm256_movepi64_mask( 2840 __intrin_bitcast<__m256i>(__x._M_data)); 2841 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4) 2842 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2843 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8) 2844 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2846 else if constexpr (__is_avx512_abi<_Abi>()) 2848 const auto __xi = __to_intrin(__x); 2849 [[maybe_unused]] constexpr auto __k1 2850 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2851 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2852 return _mm_movemask_ps(__xi); 2853 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2854 return _mm_movemask_pd(__xi); 2855 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2856 return _mm256_movemask_ps(__xi); 2857 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2858 return _mm256_movemask_pd(__xi); 2859 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2860 return _mm512_mask_cmplt_epi32_mask( 2861 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2862 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2863 return _mm512_mask_cmplt_epi64_mask( 2864 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2866 __assert_unreachable<_Tp>(); 2869 return _Base::_S_signbit(__x); 2871 using _I = __int_for_sizeof_t<_Tp>; 2872 if constexpr (sizeof(__x) == 64) 2873 return _S_less(__vector_bitcast<_I>(__x), _I()); 2876 const auto __xx = __vector_bitcast<_I>(__x._M_data); 2877 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>; 2878 if constexpr ((sizeof(_Tp) == 4 && 2879 (__have_avx2 || sizeof(__x) == 16)) || 2882 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>); 2884 else if constexpr ((__have_avx2 || 2885 (__have_ssse3 && sizeof(__x) == 16))) 2887 return __vector_bitcast<_Tp>((__xx & __signmask) == 2891 { // SSE2/3 or AVX (w/o AVX2) 2892 constexpr auto __one = 
__vector_broadcast<_Np, _Tp>(1); 2893 return __vector_bitcast<_Tp>( 2894 __vector_bitcast<_Tp>( 2895 (__xx & __signmask) | 2896 __vector_bitcast<_I>(__one)) // -1 or 1 2904 // _S_isnonzerovalue_mask {{{ 2905 // (isnormal | is subnormal == !isinf & !isnan & !is zero) 2906 template <typename _Tp> 2907 _GLIBCXX_SIMD_INTRINSIC static auto _S_isnonzerovalue_mask(_Tp __x) 2909 using _Traits = _VectorTraits<_Tp>; 2910 if constexpr (__have_avx512dq_vl) 2912 if constexpr (_Traits::template _S_is< 2913 float, 2> || _Traits::template _S_is<float, 4>) 2914 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f)); 2915 else if constexpr (_Traits::template _S_is<float, 8>) 2916 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f)); 2917 else if constexpr (_Traits::template _S_is<float, 16>) 2918 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f)); 2919 else if constexpr (_Traits::template _S_is<double, 2>) 2920 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f)); 2921 else if constexpr (_Traits::template _S_is<double, 4>) 2922 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f)); 2923 else if constexpr (_Traits::template _S_is<double, 8>) 2924 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f)); 2926 __assert_unreachable<_Tp>(); 2930 using _Up = typename _Traits::value_type; 2931 constexpr size_t _Np = _Traits::_S_full_size; 2932 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0 2933 const auto __b = __x * _Up(); // NaN if __x == inf 2934 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>()) 2935 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b), 2937 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>()) 2939 & _mm512_cmp_ps_mask(__auto_bitcast(__a), 2940 __auto_bitcast(__b), 2942 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>()) 2943 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2944 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>()) 2946 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2947 __auto_bitcast(__b), 2949 
else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>()) 2950 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2951 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>()) 2952 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a), 2953 __auto_bitcast(__b), 2955 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>()) 2956 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2957 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>()) 2959 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2960 __auto_bitcast(__b), 2962 else if constexpr (__is_avx512_ps<_Up, _Np>()) 2963 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2964 else if constexpr (__is_avx512_pd<_Up, _Np>()) 2965 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2967 __assert_unreachable<_Tp>(); 2973 template <typename _Tp, size_t _Np> 2974 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2975 _S_isfinite(_SimdWrapper<_Tp, _Np> __x) 2977 static_assert(is_floating_point_v<_Tp>); 2978 #if !__FINITE_MATH_ONLY__ 2979 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 2981 const auto __xi = __to_intrin(__x); 2982 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2983 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2984 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99); 2985 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2986 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99); 2987 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2988 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99); 2989 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2990 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99); 2991 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2992 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99); 2993 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2994 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99); 2996 else if constexpr (__is_avx512_abi<_Abi>()) 2998 // 
if all exponent bits are set, __x is either inf or NaN 2999 using _I = __int_for_sizeof_t<_Tp>; 3000 const auto __inf = __vector_bitcast<_I>( 3001 __vector_broadcast<_Np>(__infinity_v<_Tp>)); 3002 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf); 3006 return _Base::_S_isfinite(__x); 3011 template <typename _Tp, size_t _Np> 3012 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3013 _S_isinf(_SimdWrapper<_Tp, _Np> __x) 3015 #if !__FINITE_MATH_ONLY__ 3016 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3018 const auto __xi = __to_intrin(__x); 3019 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3020 return _mm512_fpclass_ps_mask(__xi, 0x18); 3021 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3022 return _mm512_fpclass_pd_mask(__xi, 0x18); 3023 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3024 return _mm256_fpclass_ps_mask(__xi, 0x18); 3025 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3026 return _mm256_fpclass_pd_mask(__xi, 0x18); 3027 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3028 return _mm_fpclass_ps_mask(__xi, 0x18); 3029 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3030 return _mm_fpclass_pd_mask(__xi, 0x18); 3032 __assert_unreachable<_Tp>(); 3034 else if constexpr (__have_avx512dq_vl) 3036 if constexpr (__is_sse_pd<_Tp, _Np>()) 3037 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18)); 3038 else if constexpr (__is_avx_pd<_Tp, _Np>()) 3039 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18)); 3040 else if constexpr (__is_sse_ps<_Tp, _Np>()) 3041 return _mm_movm_epi32( 3042 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18)); 3043 else if constexpr (__is_avx_ps<_Tp, _Np>()) 3044 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18)); 3046 __assert_unreachable<_Tp>(); 3050 return _Base::_S_isinf(__x); 3055 template <typename _Tp, size_t _Np> 3056 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3057 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 3059 #if 
__FINITE_MATH_ONLY__ 3060 [[maybe_unused]] constexpr int __mode = 0x26; 3062 [[maybe_unused]] constexpr int __mode = 0xbf; 3064 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3066 const auto __xi = __to_intrin(__x); 3067 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3068 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3069 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode); 3070 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3071 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode); 3072 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3073 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode); 3074 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3075 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode); 3076 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3077 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode); 3078 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3079 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode); 3081 __assert_unreachable<_Tp>(); 3083 else if constexpr (__have_avx512dq) 3085 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) 3086 return _mm_movm_epi32( 3087 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode))); 3088 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) 3089 return _mm256_movm_epi32( 3090 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode))); 3091 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 3092 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode)); 3093 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) 3094 return _mm_movm_epi64( 3095 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode))); 3096 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) 3097 return _mm256_movm_epi64( 3098 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode))); 3099 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 3100 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode)); 
3102 __assert_unreachable<_Tp>(); 3104 else if constexpr (__is_avx512_abi<_Abi>()) 3106 using _I = __int_for_sizeof_t<_Tp>; 3107 const auto absn = __vector_bitcast<_I>(_S_abs(__x)); 3108 const auto minn = __vector_bitcast<_I>( 3109 __vector_broadcast<_Np>(__norm_min_v<_Tp>)); 3110 #if __FINITE_MATH_ONLY__ 3111 return _S_less_equal<_I, _Np>(minn, absn); 3114 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 3115 return __and(_S_less_equal<_I, _Np>(minn, absn), 3116 _S_less<_I, _Np>(absn, infn)); 3120 return _Base::_S_isnormal(__x); 3125 template <typename _Tp, size_t _Np> 3126 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3127 _S_isnan(_SimdWrapper<_Tp, _Np> __x) 3128 { return _S_isunordered(__x, __x); } 3131 // _S_isunordered {{{ 3132 template <typename _Tp, size_t _Np> 3133 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3134 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x, 3135 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y) 3137 #if __FINITE_MATH_ONLY__ 3140 const auto __xi = __to_intrin(__x); 3141 const auto __yi = __to_intrin(__y); 3142 if constexpr (__is_avx512_abi<_Abi>()) 3144 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3145 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3146 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3147 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3148 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3149 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3150 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3151 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3152 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3153 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3154 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3155 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3156 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3158 else if 
constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3159 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q)); 3160 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3161 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q)); 3162 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3163 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi)); 3164 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3165 return __to_masktype(_mm_cmpunord_pd(__xi, __yi)); 3167 __assert_unreachable<_Tp>(); 3173 template <typename _Tp, size_t _Np> 3174 static constexpr _MaskMember<_Tp> _S_isgreater(_SimdWrapper<_Tp, _Np> __x, 3175 _SimdWrapper<_Tp, _Np> __y) 3177 const auto __xi = __to_intrin(__x); 3178 const auto __yi = __to_intrin(__y); 3179 if constexpr (__is_avx512_abi<_Abi>()) 3181 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3182 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3183 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3184 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3185 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3186 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3187 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3188 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3189 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3190 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3191 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3192 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3193 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3195 __assert_unreachable<_Tp>(); 3197 else if constexpr (__have_avx) 3199 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3200 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3201 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3202 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3203 else if constexpr 
(sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3204 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3205 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3206 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3208 __assert_unreachable<_Tp>(); 3210 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3211 && sizeof(_Tp) == 4) 3213 const auto __xn = __vector_bitcast<int>(__xi); 3214 const auto __yn = __vector_bitcast<int>(__yi); 3215 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3216 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3217 return __auto_bitcast( 3218 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp)); 3220 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3221 && sizeof(_Tp) == 8) 3222 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3223 -_mm_ucomigt_sd(__xi, __yi), 3224 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi), 3225 _mm_unpackhi_pd(__yi, __yi))}; 3227 return _Base::_S_isgreater(__x, __y); 3231 // _S_isgreaterequal {{{ 3232 template <typename _Tp, size_t _Np> 3233 static constexpr _MaskMember<_Tp> 3234 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3236 const auto __xi = __to_intrin(__x); 3237 const auto __yi = __to_intrin(__y); 3238 if constexpr (__is_avx512_abi<_Abi>()) 3240 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3241 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3242 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3243 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3244 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3245 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3246 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3247 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3248 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3249 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3250 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, 
_CMP_GE_OQ); 3251 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3252 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3254 __assert_unreachable<_Tp>(); 3256 else if constexpr (__have_avx) 3258 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3259 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3260 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3261 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3262 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3263 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3264 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3265 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3267 __assert_unreachable<_Tp>(); 3269 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3270 && sizeof(_Tp) == 4) 3272 const auto __xn = __vector_bitcast<int>(__xi); 3273 const auto __yn = __vector_bitcast<int>(__yi); 3274 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3275 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3276 return __auto_bitcast( 3277 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp)); 3279 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3280 && sizeof(_Tp) == 8) 3281 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3282 -_mm_ucomige_sd(__xi, __yi), 3283 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi), 3284 _mm_unpackhi_pd(__yi, __yi))}; 3286 return _Base::_S_isgreaterequal(__x, __y); 3291 template <typename _Tp, size_t _Np> 3292 static constexpr _MaskMember<_Tp> _S_isless(_SimdWrapper<_Tp, _Np> __x, 3293 _SimdWrapper<_Tp, _Np> __y) 3295 const auto __xi = __to_intrin(__x); 3296 const auto __yi = __to_intrin(__y); 3297 if constexpr (__is_avx512_abi<_Abi>()) 3299 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3300 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3301 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3302 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3303 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3304 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3305 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3306 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3307 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3308 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3309 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3310 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3311 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3313 __assert_unreachable<_Tp>(); 3315 else if constexpr (__have_avx) 3317 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3318 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3319 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3320 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3321 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3322 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, 
_CMP_LT_OQ)); 3323 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3324 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3326 __assert_unreachable<_Tp>(); 3328 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3329 && sizeof(_Tp) == 4) 3331 const auto __xn = __vector_bitcast<int>(__xi); 3332 const auto __yn = __vector_bitcast<int>(__yi); 3333 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3334 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3335 return __auto_bitcast( 3336 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp)); 3338 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3339 && sizeof(_Tp) == 8) 3340 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3341 -_mm_ucomigt_sd(__yi, __xi), 3342 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi), 3343 _mm_unpackhi_pd(__xi, __xi))}; 3345 return _Base::_S_isless(__x, __y); 3349 // _S_islessequal {{{ 3350 template <typename _Tp, size_t _Np> 3351 static constexpr _MaskMember<_Tp> 3352 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3354 const auto __xi = __to_intrin(__x); 3355 const auto __yi = __to_intrin(__y); 3356 if constexpr (__is_avx512_abi<_Abi>()) 3358 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3359 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3360 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3361 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3362 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3363 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3364 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3365 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3366 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3367 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3368 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3369 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3370 return 
_mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3372 __assert_unreachable<_Tp>(); 3374 else if constexpr (__have_avx) 3376 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3377 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3378 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3379 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3380 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3381 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3382 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3383 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3385 __assert_unreachable<_Tp>(); 3387 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3388 && sizeof(_Tp) == 4) 3390 const auto __xn = __vector_bitcast<int>(__xi); 3391 const auto __yn = __vector_bitcast<int>(__yi); 3392 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3393 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3394 return __auto_bitcast( 3395 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp)); 3397 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3398 && sizeof(_Tp) == 8) 3399 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3400 -_mm_ucomige_sd(__yi, __xi), 3401 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi), 3402 _mm_unpackhi_pd(__xi, __xi))}; 3404 return _Base::_S_islessequal(__x, __y); 3408 // _S_islessgreater {{{ 3409 template <typename _Tp, size_t _Np> 3410 static constexpr _MaskMember<_Tp> 3411 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3413 const auto __xi = __to_intrin(__x); 3414 const auto __yi = __to_intrin(__y); 3415 if constexpr (__is_avx512_abi<_Abi>()) 3417 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3418 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3419 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3420 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3421 return _mm512_mask_cmp_pd_mask(__k1, 
__xi, __yi, _CMP_NEQ_OQ); 3422 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3423 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3424 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3425 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3426 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3427 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3428 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3429 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3431 __assert_unreachable<_Tp>(); 3433 else if constexpr (__have_avx) 3435 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3436 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3437 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3438 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3439 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3440 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3441 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3442 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3444 __assert_unreachable<_Tp>(); 3446 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3447 return __auto_bitcast( 3448 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi))); 3449 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3450 return __to_masktype( 3451 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi))); 3453 __assert_unreachable<_Tp>(); 3460 // _MaskImplX86Mixin {{{ 3461 struct _MaskImplX86Mixin 3463 template <typename _Tp> 3464 using _TypeTag = _Tp*; 3466 using _Base = _MaskImplBuiltinMixin; 3468 // _S_to_maskvector(bool) {{{ 3469 template <typename _Up, size_t _ToN = 1, typename _Tp> 3470 _GLIBCXX_SIMD_INTRINSIC static constexpr enable_if_t< 3471 is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>> 3472 _S_to_maskvector(_Tp __x) 3474 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3475 return __x ? 
__vector_type_t<_Up, _ToN>{~_Up()} 3476 : __vector_type_t<_Up, _ToN>(); 3480 // _S_to_maskvector(_SanitizedBitMask) {{{ 3481 template <typename _Up, size_t _UpN = 0, size_t _Np, 3482 size_t _ToN = _UpN == 0 ? _Np : _UpN> 3483 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3484 _S_to_maskvector(_SanitizedBitMask<_Np> __x) 3486 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3487 using _UV = __vector_type_t<_Up, _ToN>; 3488 using _UI = __intrinsic_type_t<_Up, _ToN>; 3489 [[maybe_unused]] const auto __k = __x._M_to_bits(); 3490 if constexpr (_Np == 1) 3491 return _S_to_maskvector<_Up, _ToN>(__k); 3492 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3493 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>( 3494 [&](auto __i) -> _Up { return -__x[__i.value]; }); 3495 else if constexpr (sizeof(_Up) == 1) 3497 if constexpr (sizeof(_UI) == 16) 3499 if constexpr (__have_avx512bw_vl) 3500 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k)); 3501 else if constexpr (__have_avx512bw) 3502 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k))); 3503 else if constexpr (__have_avx512f) 3505 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3507 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3508 __hi256(__as32bits))); 3509 return __intrin_bitcast<_UV>( 3510 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 3512 else if constexpr (__have_ssse3) 3514 const auto __bitmask = __to_intrin( 3515 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 3516 8, 16, 32, 64, 128)); 3517 return __intrin_bitcast<_UV>( 3518 __vector_bitcast<_Up>( 3519 _mm_shuffle_epi8(__to_intrin( 3520 __vector_type_t<_ULLong, 2>{__k}), 3521 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3522 1, 1, 1, 1, 1, 1, 1)) 3526 // else fall through 3528 else if constexpr (sizeof(_UI) == 32) 3530 if constexpr (__have_avx512bw_vl) 3531 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k)); 3532 else if constexpr (__have_avx512bw) 3533 return 
__vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k))); 3534 else if constexpr (__have_avx512f) 3536 auto __as16bits = // 0 16 1 17 ... 15 31 3537 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()), 3539 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16, 3542 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16( 3543 __lo256(__as16bits), 3544 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ... 3547 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8( 3548 __0_16_1_17, // 0 16 1 17 2 ... 3549 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 3550 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3552 15)))); // 0-7 16-23 8-15 24-31 -> xzyw 3553 // 0-3 8-11 16-19 24-27 3554 // 4-7 12-15 20-23 28-31 3556 else if constexpr (__have_avx2) 3558 const auto __bitmask 3559 = _mm256_broadcastsi128_si256(__to_intrin( 3560 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 3561 4, 8, 16, 32, 64, 128))); 3562 return __vector_bitcast<_Up>( 3563 __vector_bitcast<_Up>( 3564 _mm256_shuffle_epi8( 3565 _mm256_broadcastsi128_si256( 3566 __to_intrin(__vector_type_t<_ULLong, 2>{__k})), 3567 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3568 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3573 // else fall through 3575 else if constexpr (sizeof(_UI) == 64) 3576 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k)); 3577 if constexpr (std::min(_ToN, _Np) <= 4) 3579 if constexpr (_Np > 7) // avoid overflow 3580 __x &= _SanitizedBitMask<_Np>(0x0f); 3581 const _UInt __char_mask 3582 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL) 3585 __builtin_memcpy(&__r, &__char_mask, 3586 std::min(sizeof(__r), sizeof(__char_mask))); 3589 else if constexpr (std::min(_ToN, _Np) <= 7) 3591 if constexpr (_Np > 7) // avoid overflow 3592 __x &= _SanitizedBitMask<_Np>(0x7f); 3593 const _ULLong __char_mask 3594 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL) 3597 __builtin_memcpy(&__r, &__char_mask, 3598 std::min(sizeof(__r), sizeof(__char_mask))); 3602 else if constexpr 
(sizeof(_Up) == 2) 3604 if constexpr (sizeof(_UI) == 16) 3606 if constexpr (__have_avx512bw_vl) 3607 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k)); 3608 else if constexpr (__have_avx512bw) 3609 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k))); 3610 else if constexpr (__have_avx512f) 3612 __m256i __as32bits = {}; 3613 if constexpr (__have_avx512vl) 3614 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i()); 3617 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())); 3618 return __intrin_bitcast<_UV>( 3619 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits))); 3621 // else fall through 3623 else if constexpr (sizeof(_UI) == 32) 3625 if constexpr (__have_avx512bw_vl) 3626 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k)); 3627 else if constexpr (__have_avx512bw) 3628 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k))); 3629 else if constexpr (__have_avx512f) 3631 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3632 return __vector_bitcast<_Up>( 3633 __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3634 __hi256(__as32bits)))); 3636 // else fall through 3638 else if constexpr (sizeof(_UI) == 64) 3639 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k)); 3641 else if constexpr (sizeof(_Up) == 4) 3643 if constexpr (sizeof(_UI) == 16) 3645 if constexpr (__have_avx512dq_vl) 3646 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k)); 3647 else if constexpr (__have_avx512dq) 3648 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k))); 3649 else if constexpr (__have_avx512vl) 3650 return __intrin_bitcast<_UV>( 3651 _mm_maskz_mov_epi32(__k, ~__m128i())); 3652 else if constexpr (__have_avx512f) 3653 return __intrin_bitcast<_UV>( 3654 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3655 // else fall through 3657 else if constexpr (sizeof(_UI) == 32) 3659 if constexpr (__have_avx512dq_vl) 3660 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k)); 3661 else if constexpr (__have_avx512dq) 3662 return 
__vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k))); 3663 else if constexpr (__have_avx512vl) 3664 return __vector_bitcast<_Up>( 3665 _mm256_maskz_mov_epi32(__k, ~__m256i())); 3666 else if constexpr (__have_avx512f) 3667 return __vector_bitcast<_Up>( 3668 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3669 // else fall through 3671 else if constexpr (sizeof(_UI) == 64) 3672 return __vector_bitcast<_Up>( 3673 __have_avx512dq ? _mm512_movm_epi32(__k) 3674 : _mm512_maskz_mov_epi32(__k, ~__m512i())); 3676 else if constexpr (sizeof(_Up) == 8) 3678 if constexpr (sizeof(_UI) == 16) 3680 if constexpr (__have_avx512dq_vl) 3681 return __vector_bitcast<_Up>(_mm_movm_epi64(__k)); 3682 else if constexpr (__have_avx512dq) 3683 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k))); 3684 else if constexpr (__have_avx512vl) 3685 return __vector_bitcast<_Up>( 3686 _mm_maskz_mov_epi64(__k, ~__m128i())); 3687 else if constexpr (__have_avx512f) 3688 return __vector_bitcast<_Up>( 3689 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3690 // else fall through 3692 else if constexpr (sizeof(_UI) == 32) 3694 if constexpr (__have_avx512dq_vl) 3695 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k)); 3696 else if constexpr (__have_avx512dq) 3697 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k))); 3698 else if constexpr (__have_avx512vl) 3699 return __vector_bitcast<_Up>( 3700 _mm256_maskz_mov_epi64(__k, ~__m256i())); 3701 else if constexpr (__have_avx512f) 3702 return __vector_bitcast<_Up>( 3703 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3704 // else fall through 3706 else if constexpr (sizeof(_UI) == 64) 3707 return __vector_bitcast<_Up>( 3708 __have_avx512dq ? 
_mm512_movm_epi64(__k) 3709 : _mm512_maskz_mov_epi64(__k, ~__m512i())); 3712 using _UpUInt = make_unsigned_t<_Up>; 3713 using _V = __vector_type_t<_UpUInt, _ToN>; 3714 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__; 3715 if constexpr (_ToN == 2) 3717 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])}); 3719 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32) 3721 if constexpr (sizeof(_Up) == 4) 3722 return __vector_bitcast<_Up>(_mm256_cmp_ps( 3723 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)), 3724 _mm256_castsi256_ps(_mm256_setr_epi32( 3725 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))), 3726 _mm256_setzero_ps(), _CMP_NEQ_UQ)); 3727 else if constexpr (sizeof(_Up) == 8) 3728 return __vector_bitcast<_Up>(_mm256_cmp_pd( 3729 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)), 3730 _mm256_castsi256_pd( 3731 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))), 3732 _mm256_setzero_pd(), _CMP_NEQ_UQ)); 3734 __assert_unreachable<_Up>(); 3736 else if constexpr (__bits_per_element >= _ToN) 3738 constexpr auto __bitmask 3739 = __generate_vector<_V>([](auto __i) constexpr->_UpUInt { 3740 return __i < _ToN ? 1ull << __i : 0; 3743 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; 3744 if constexpr (__bits_per_element > _ToN) 3745 return __vector_bitcast<_Up>(__bits) > 0; 3747 return __vector_bitcast<_Up>(__bits != 0); 3752 = __generate_vector<_V>([&](auto __i) constexpr { 3753 return static_cast<_UpUInt>( 3754 __k >> (__bits_per_element * (__i / __bits_per_element))); 3756 & __generate_vector<_V>([](auto __i) constexpr { 3757 return static_cast<_UpUInt>(1ull 3758 << (__i % __bits_per_element)); 3759 }); // mask bit index 3760 return __intrin_bitcast<_UV>(__tmp != _V()); 3765 // _S_to_maskvector(_SimdWrapper) {{{ 3766 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, 3767 size_t _ToN = _UpN == 0 ? 
_Np : _UpN> 3768 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3769 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 3771 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3772 using _TW = _SimdWrapper<_Tp, _Np>; 3773 using _UW = _SimdWrapper<_Up, _ToN>; 3774 using _UI = __intrinsic_type_t<_Up, _ToN>; 3775 if constexpr (is_same_v<_Tp, bool>) // bits -> vector 3776 return _S_to_maskvector<_Up, _ToN>( 3777 _BitMask<_Np>(__x._M_data)._M_sanitized()); 3778 // vector -> vector bitcast 3779 else if constexpr (sizeof(_Up) == sizeof(_Tp) 3780 && sizeof(_TW) == sizeof(_UW)) 3781 return __wrapper_bitcast<_Up, _ToN>( 3784 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x)); 3785 else // vector -> vector {{{ 3787 if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3789 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); 3790 return __generate_from_n_evaluations<std::min(_ToN, _Np), 3791 __vector_type_t<_Up, _ToN>>( 3792 [&](auto __i) -> _Up { return __y[__i.value]; }); 3794 using _To = __vector_type_t<_Up, _ToN>; 3795 [[maybe_unused]] constexpr size_t _FromN = _Np; 3796 constexpr int _FromBytes = sizeof(_Tp); 3797 constexpr int _ToBytes = sizeof(_Up); 3798 const auto __k = __x._M_data; 3800 if constexpr (_FromBytes == _ToBytes) 3801 return __intrin_bitcast<_To>(__k); 3802 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16) 3804 if constexpr (_FromBytes == 4 && _ToBytes == 8) 3805 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3806 else if constexpr (_FromBytes == 2 && _ToBytes == 8) 3809 = __vector_bitcast<int>(__interleave128_lo(__k, __k)); 3810 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3812 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3815 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3817 = __vector_bitcast<int>(__interleave128_lo(__y, __y)); 3818 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z)); 3820 else if constexpr (_FromBytes == 8 && _ToBytes == 4 
3822 return __intrin_bitcast<_To>( 3823 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3824 else if constexpr (_FromBytes == 8 && _ToBytes == 4) 3825 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k), 3827 else if constexpr (_FromBytes == 2 && _ToBytes == 4) 3828 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3829 else if constexpr (_FromBytes == 1 && _ToBytes == 4) 3832 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3833 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3835 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 3837 if constexpr (__have_sse2 && !__have_ssse3) 3838 return __intrin_bitcast<_To>(_mm_packs_epi32( 3839 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()), 3842 return __intrin_bitcast<_To>( 3843 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>( 3844 __vector_bitcast<_Up>(__k))); 3846 else if constexpr (_FromBytes == 4 && _ToBytes == 2) 3847 return __intrin_bitcast<_To>( 3848 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3849 else if constexpr (_FromBytes == 1 && _ToBytes == 2) 3850 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3851 else if constexpr (_FromBytes == 8 && _ToBytes == 1 3853 return __intrin_bitcast<_To>( 3854 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3855 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1, 3856 -1, -1, -1, -1, -1, -1, -1, 3858 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 3861 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3862 __y = _mm_packs_epi32(__y, __m128i()); 3863 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3865 else if constexpr (_FromBytes == 4 && _ToBytes == 1 3867 return __intrin_bitcast<_To>( 3868 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3869 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 3870 -1, -1, -1, -1, -1, -1, -1, 3872 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 3875 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3876 return 
__intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3878 else if constexpr (_FromBytes == 2 && _ToBytes == 1) 3879 return __intrin_bitcast<_To>( 3880 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())); 3882 __assert_unreachable<_Tp>(); 3884 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32) 3886 if constexpr (_FromBytes == _ToBytes) 3887 __assert_unreachable<_Tp>(); 3888 else if constexpr (_FromBytes == _ToBytes * 2) 3890 const auto __y = __vector_bitcast<_LLong>(__k); 3891 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 3892 _mm_packs_epi16(__lo128(__y), __hi128(__y)))); 3894 else if constexpr (_FromBytes == _ToBytes * 4) 3896 const auto __y = __vector_bitcast<_LLong>(__k); 3897 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 3898 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 3901 else if constexpr (_FromBytes == _ToBytes * 8) 3903 const auto __y = __vector_bitcast<_LLong>(__k); 3904 return __intrin_bitcast<_To>( 3905 _mm256_castsi128_si256(_mm_shuffle_epi8( 3906 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 3907 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 3908 -1, -1, -1, -1, -1)))); 3910 else if constexpr (_FromBytes * 2 == _ToBytes) 3912 auto __y = __xzyw(__to_intrin(__k)); 3913 if constexpr (is_floating_point_v< 3914 _Tp> || (!__have_avx2 && _FromBytes == 4)) 3916 const auto __yy = __vector_bitcast<float>(__y); 3917 return __intrin_bitcast<_To>( 3918 _mm256_unpacklo_ps(__yy, __yy)); 3921 return __intrin_bitcast<_To>( 3922 _mm256_unpacklo_epi8(__y, __y)); 3924 else if constexpr (_FromBytes * 4 == _ToBytes) 3927 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 3928 __lo128(__vector_bitcast<_LLong>( 3929 __k))); // drops 3/4 of input 3930 return __intrin_bitcast<_To>( 3931 __concat(_mm_unpacklo_epi16(__y, __y), 3932 _mm_unpackhi_epi16(__y, __y))); 3934 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3937 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 3938 
__lo128(__vector_bitcast<_LLong>( 3939 __k))); // drops 3/4 of input 3941 = _mm_unpacklo_epi16(__y, 3942 __y); // drops another 1/2 => 7/8 total 3943 return __intrin_bitcast<_To>( 3944 __concat(_mm_unpacklo_epi32(__y, __y), 3945 _mm_unpackhi_epi32(__y, __y))); 3948 __assert_unreachable<_Tp>(); 3950 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16) 3952 if constexpr (_FromBytes == _ToBytes) 3953 return __intrin_bitcast<_To>( 3954 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>( 3955 __zero_extend(__to_intrin(__k)))); 3956 else if constexpr (_FromBytes * 2 == _ToBytes) 3958 return __intrin_bitcast<_To>( 3959 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k), 3960 __vector_bitcast<_LLong>(__k)), 3961 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k), 3962 __vector_bitcast<_LLong>(__k)))); 3964 else if constexpr (_FromBytes * 4 == _ToBytes) 3966 if constexpr (__have_avx2) 3968 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 3969 __concat(__vector_bitcast<_LLong>(__k), 3970 __vector_bitcast<_LLong>(__k)), 3971 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3972 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 3973 6, 6, 7, 7, 7, 7))); 3977 return __intrin_bitcast<_To>(__concat( 3978 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3979 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 3980 2, 2, 2, 2, 3, 3, 3, 3)), 3981 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3982 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5, 3983 6, 6, 6, 6, 7, 7, 7, 3987 else if constexpr (_FromBytes * 8 == _ToBytes) 3989 if constexpr (__have_avx2) 3991 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 3992 __concat(__vector_bitcast<_LLong>(__k), 3993 __vector_bitcast<_LLong>(__k)), 3994 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3995 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3996 3, 3, 3, 3, 3, 3))); 4000 return __intrin_bitcast<_To>(__concat( 4001 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4002 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 4003 1, 1, 1, 1, 1, 1, 1, 1)), 4004 
_mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4005 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 4006 3, 3, 3, 3, 3, 3, 3, 4010 else if constexpr (_FromBytes == _ToBytes * 2) 4011 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4012 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())))); 4013 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 4015 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4016 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4017 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, 4018 -1, -1, -1, -1, -1, -1, -1, 4021 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4023 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4024 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4025 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4026 -1, -1, -1, -1, -1, -1, -1, 4029 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4031 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4032 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4033 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, 4034 -1, -1, -1, -1, -1, -1, -1, 4038 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable
"); 4040 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32) 4042 if constexpr (_FromBytes == _ToBytes) 4044 return __intrin_bitcast<_To>(__lo128(__k)); 4046 else if constexpr (_FromBytes == _ToBytes * 2) 4048 auto __y = __vector_bitcast<_LLong>(__k); 4049 return __intrin_bitcast<_To>( 4050 _mm_packs_epi16(__lo128(__y), __hi128(__y))); 4052 else if constexpr (_FromBytes == _ToBytes * 4) 4054 auto __y = __vector_bitcast<_LLong>(__k); 4055 return __intrin_bitcast<_To>( 4056 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4059 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4061 auto __y = __vector_bitcast<_LLong>(__k); 4062 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 4063 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4064 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, 4067 else if constexpr (_FromBytes * 2 == _ToBytes) 4069 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4070 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4072 else if constexpr (_FromBytes * 4 == _ToBytes) 4074 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4075 __y = _mm_unpacklo_epi8(__y, __y); 4076 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4078 else if constexpr (_FromBytes * 8 == _ToBytes) 4080 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4081 __y = _mm_unpacklo_epi8(__y, __y); 4082 __y = _mm_unpacklo_epi8(__y, __y); 4083 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4086 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable
"); 4089 return _Base::template _S_to_maskvector<_Up, _ToN>(__x); 4091 if constexpr (_FromBytes > _ToBytes) { 4092 const _To __y = __vector_bitcast<_Up>(__k); 4093 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4094 constexpr int _Stride = _FromBytes / _ToBytes; 4095 return _To{__y[(_Is + 1) * _Stride - 1]...}; 4096 }(make_index_sequence<std::min(_ToN, _FromN)>()); 4098 // {0, 0, 1, 1} (_Dups = 2, _Is<4>) 4099 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>) 4100 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>) 4102 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4103 constexpr int __dup = _ToBytes / _FromBytes; 4104 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...}); 4105 }(make_index_sequence<_FromN>()); 4113 template <typename _Tp, size_t _Np> 4114 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 4115 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 4117 if constexpr (is_same_v<_Tp, bool>) 4118 return _BitMask<_Np>(__x._M_data)._M_sanitized(); 4121 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4122 if (__builtin_is_constant_evaluated() 4123 || __builtin_constant_p(__x._M_data)) 4125 const auto __bools = -__x._M_data; 4126 const _ULLong __k = __call_with_n_evaluations<_Np>( 4127 [](auto... 
__bits) { return (__bits | ...); }, 4128 [&](auto __i) { return _ULLong(__bools[+__i]) << __i; }); 4129 if (__builtin_is_constant_evaluated() 4130 || __builtin_constant_p(__k)) 4133 const auto __xi = __to_intrin(__x); 4134 if constexpr (sizeof(_Tp) == 1) 4135 if constexpr (sizeof(__xi) == 16) 4136 if constexpr (__have_avx512bw_vl) 4137 return _BitMask<_Np>(_mm_movepi8_mask(__xi)); 4138 else // implies SSE2 4139 return _BitMask<_Np>(_mm_movemask_epi8(__xi)); 4140 else if constexpr (sizeof(__xi) == 32) 4141 if constexpr (__have_avx512bw_vl) 4142 return _BitMask<_Np>(_mm256_movepi8_mask(__xi)); 4143 else // implies AVX2 4144 return _BitMask<_Np>(_mm256_movemask_epi8(__xi)); 4145 else // implies AVX512BW 4146 return _BitMask<_Np>(_mm512_movepi8_mask(__xi)); 4148 else if constexpr (sizeof(_Tp) == 2) 4149 if constexpr (sizeof(__xi) == 16) 4150 if constexpr (__have_avx512bw_vl) 4151 return _BitMask<_Np>(_mm_movepi16_mask(__xi)); 4152 else if constexpr (__have_avx512bw) 4153 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4154 else // implies SSE2 4155 return _BitMask<_Np>( 4156 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i()))); 4157 else if constexpr (sizeof(__xi) == 32) 4158 if constexpr (__have_avx512bw_vl) 4159 return _BitMask<_Np>(_mm256_movepi16_mask(__xi)); 4160 else if constexpr (__have_avx512bw) 4161 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4162 else // implies SSE2 4163 return _BitMask<_Np>(_mm_movemask_epi8( 4164 _mm_packs_epi16(__lo128(__xi), __hi128(__xi)))); 4165 else // implies AVX512BW 4166 return _BitMask<_Np>(_mm512_movepi16_mask(__xi)); 4168 else if constexpr (sizeof(_Tp) == 4) 4169 if constexpr (sizeof(__xi) == 16) 4170 if constexpr (__have_avx512dq_vl) 4171 return _BitMask<_Np>(_mm_movepi32_mask(__xi)); 4172 else if constexpr (__have_avx512vl) 4173 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i())); 4174 else if constexpr (__have_avx512dq) 4175 return 
_BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4176 else if constexpr (__have_avx512f) 4177 return _BitMask<_Np>( 4178 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4180 return _BitMask<_Np>( 4181 _mm_movemask_ps(reinterpret_cast<__m128>(__xi))); 4182 else if constexpr (sizeof(__xi) == 32) 4183 if constexpr (__have_avx512dq_vl) 4184 return _BitMask<_Np>(_mm256_movepi32_mask(__xi)); 4185 else if constexpr (__have_avx512dq) 4186 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4187 else if constexpr (__have_avx512vl) 4188 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i())); 4189 else if constexpr (__have_avx512f) 4190 return _BitMask<_Np>( 4191 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4193 return _BitMask<_Np>( 4194 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi))); 4195 else // implies AVX512?? 4196 if constexpr (__have_avx512dq) 4197 return _BitMask<_Np>(_mm512_movepi32_mask(__xi)); 4198 else // implies AVX512F 4199 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i())); 4201 else if constexpr (sizeof(_Tp) == 8) 4202 if constexpr (sizeof(__xi) == 16) 4203 if constexpr (__have_avx512dq_vl) 4204 return _BitMask<_Np>(_mm_movepi64_mask(__xi)); 4205 else if constexpr (__have_avx512dq) 4206 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4207 else if constexpr (__have_avx512vl) 4208 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i())); 4209 else if constexpr (__have_avx512f) 4210 return _BitMask<_Np>( 4211 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4212 else // implies SSE2 4213 return _BitMask<_Np>( 4214 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi))); 4215 else if constexpr (sizeof(__xi) == 32) 4216 if constexpr (__have_avx512dq_vl) 4217 return _BitMask<_Np>(_mm256_movepi64_mask(__xi)); 4218 else if constexpr (__have_avx512dq) 4219 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4220 else if constexpr (__have_avx512vl) 4221 return 
_BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i())); 4222 else if constexpr (__have_avx512f) 4223 return _BitMask<_Np>( 4224 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4226 return _BitMask<_Np>( 4227 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi))); 4228 else // implies AVX512?? 4229 if constexpr (__have_avx512dq) 4230 return _BitMask<_Np>(_mm512_movepi64_mask(__xi)); 4231 else // implies AVX512F 4232 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i())); 4235 __assert_unreachable<_Tp>(); 4243 template <typename _Abi> 4244 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi> 4246 using _MaskImplX86Mixin::_S_to_bits; 4247 using _MaskImplX86Mixin::_S_to_maskvector; 4248 using _MaskImplBuiltin<_Abi>::_S_convert; 4251 template <typename _Tp> 4252 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 4254 template <typename _Tp> 4255 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 4257 template <typename _Tp> 4258 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 4260 using _Base = _MaskImplBuiltin<_Abi>; 4264 template <typename _Tp> 4265 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4266 _S_broadcast(bool __x) 4268 if constexpr (__is_avx512_abi<_Abi>()) 4269 return __x ? 
_Abi::_S_masked(_MaskMember<_Tp>(-1)) 4270 : _MaskMember<_Tp>(); 4272 return _Base::template _S_broadcast<_Tp>(__x); 4277 template <typename _Tp> 4278 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4279 _S_load(const bool* __mem) 4281 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4282 if constexpr (__have_avx512bw) 4284 const auto __to_vec_or_bits = [](auto __bits) -> decltype(auto) { 4285 if constexpr (__is_avx512_abi<_Abi>()) 4288 return _S_to_maskvector<_Tp>( 4289 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); 4292 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) 4295 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4296 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a)); 4298 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl) 4301 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4302 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a)); 4304 else if constexpr (_S_size<_Tp> <= 64) 4307 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4308 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a)); 4311 else if constexpr (__is_avx512_abi<_Abi>()) 4313 if constexpr (_S_size<_Tp> <= 8) 4316 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4317 const auto __b = _mm512_cvtepi8_epi64(__a); 4318 return _mm512_test_epi64_mask(__b, __b); 4320 else if constexpr (_S_size<_Tp> <= 16) 4323 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4324 const auto __b = _mm512_cvtepi8_epi32(__a); 4325 return _mm512_test_epi32_mask(__b, __b); 4327 else if constexpr (_S_size<_Tp> <= 32) 4330 __builtin_memcpy(&__a, __mem, 16); 4331 const auto __b = _mm512_cvtepi8_epi32(__a); 4332 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16); 4333 const auto __c = _mm512_cvtepi8_epi32(__a); 4334 return _mm512_test_epi32_mask(__b, __b) 4335 | (_mm512_test_epi32_mask(__c, __c) << 16); 4337 else if constexpr (_S_size<_Tp> <= 64) 4340 __builtin_memcpy(&__a, __mem, 16); 4341 const auto __b = _mm512_cvtepi8_epi32(__a); 4342 __builtin_memcpy(&__a, __mem + 16, 16); 4343 const auto __c = 
_mm512_cvtepi8_epi32(__a); 4344 if constexpr (_S_size<_Tp> <= 48) 4346 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32); 4347 const auto __d = _mm512_cvtepi8_epi32(__a); 4348 return _mm512_test_epi32_mask(__b, __b) 4349 | (_mm512_test_epi32_mask(__c, __c) << 16) 4350 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32); 4354 __builtin_memcpy(&__a, __mem + 16, 16); 4355 const auto __d = _mm512_cvtepi8_epi32(__a); 4356 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 48); 4357 const auto __e = _mm512_cvtepi8_epi32(__a); 4358 return _mm512_test_epi32_mask(__b, __b) 4359 | (_mm512_test_epi32_mask(__c, __c) << 16) 4360 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32) 4361 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48); 4365 __assert_unreachable<_Tp>(); 4367 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2) 4368 return __vector_bitcast<_Tp>( 4369 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]), 4370 -int(__mem[1]), -int(__mem[1])}); 4371 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx) 4374 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>); 4375 const auto __k = __to_intrin( 4376 (__vector_broadcast<4>(__bool4) 4377 & __make_vector<int>(0x1, 0x100, 0x10000, 4378 _S_size<_Tp> == 4 ? 
0x1000000 : 0)) 4380 return __vector_bitcast<_Tp>( 4381 __concat(_mm_unpacklo_epi32(__k, __k), 4382 _mm_unpackhi_epi32(__k, __k))); 4384 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4) 4387 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>); 4388 if constexpr (__have_sse2) 4390 __m128i __k = _mm_cvtsi32_si128(__bools); 4391 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4392 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4393 _mm_unpacklo_epi16(__k, __k)); 4397 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools)); 4399 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4400 _mm_cmpgt_ps(__k, __m128())); 4403 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8) 4406 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4407 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4408 return __vector_bitcast<_Tp>( 4409 __concat(_mm_unpacklo_epi16(__k, __k), 4410 _mm_unpackhi_epi16(__k, __k))); 4412 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16) 4415 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4416 __k = _mm_cmpgt_epi8(__k, __m128i()); 4417 if constexpr (_S_size<_Tp> <= 8) 4418 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4419 _mm_unpacklo_epi8(__k, __k)); 4421 return __concat(_mm_unpacklo_epi8(__k, __k), 4422 _mm_unpackhi_epi8(__k, __k)); 4425 return _Base::template _S_load<_Tp>(__mem); 4429 // _S_from_bitmask{{{ 4430 template <size_t _Np, typename _Tp> 4431 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 4432 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) 4434 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4435 if constexpr (__is_avx512_abi<_Abi>()) 4436 return __bits._M_to_bits(); 4438 return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); 4442 // _S_masked_load {{{2 4443 template <typename _Tp, size_t _Np> 4444 static inline _SimdWrapper<_Tp, _Np> 4445 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 4446 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 4448 if constexpr (__is_avx512_abi<_Abi>()) 4450 if 
constexpr (__have_avx512bw_vl) 4452 if constexpr (_Np <= 16) 4455 = _mm_mask_loadu_epi8(__m128i(), __mask, __mem); 4456 return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a); 4458 else if constexpr (_Np <= 32) 4461 = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem); 4462 return (__merge & ~__mask) 4463 | _mm256_test_epi8_mask(__a, __a); 4465 else if constexpr (_Np <= 64) 4468 = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem); 4469 return (__merge & ~__mask) 4470 | _mm512_test_epi8_mask(__a, __a); 4473 __assert_unreachable<_Tp>(); 4477 _BitOps::_S_bit_iteration(__mask, [&](auto __i) { 4478 __merge._M_set(__i, __mem[__i]); 4483 else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1) 4485 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4486 __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(), 4487 _mm256_mask_loadu_epi8(__m256i(), 4490 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1) 4492 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4494 = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k, 4496 _mm_mask_loadu_epi8(__m128i(), __k, __mem)); 4498 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2) 4500 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4501 __merge = _mm256_mask_sub_epi16( 4502 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4503 _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); 4505 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2) 4507 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4508 __merge = _mm_mask_sub_epi16( 4509 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4510 _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); 4512 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4) 4514 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4515 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32( 4516 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4517 _mm256_cvtepi8_epi32( 4518 
_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4520 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4) 4522 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4523 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32( 4524 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4525 _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4527 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8) 4529 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4530 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64( 4531 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4532 _mm256_cvtepi8_epi64( 4533 _mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4535 else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8) 4537 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4538 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64( 4539 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4540 _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4543 return _Base::_S_masked_load(__merge, __mask, __mem); 4548 template <typename _Tp, size_t _Np> 4549 _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __v, 4550 bool* __mem) noexcept 4552 if constexpr (__is_avx512_abi<_Abi>()) 4554 if constexpr (__have_avx512bw_vl) 4555 _CommonImplX86::_S_store<_Np>( 4556 __vector_bitcast<char>([](auto __data) { 4557 if constexpr (_Np <= 16) 4558 return _mm_maskz_set1_epi8(__data, 1); 4559 else if constexpr (_Np <= 32) 4560 return _mm256_maskz_set1_epi8(__data, 1); 4562 return _mm512_maskz_set1_epi8(__data, 1); 4565 else if constexpr (_Np <= 8) 4566 _CommonImplX86::_S_store<_Np>( 4567 __vector_bitcast<char>( 4568 #if defined __x86_64__ 4569 __make_wrapper<_ULLong>( 4570 _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull) 4572 __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U), 4573 _pdep_u32(__v._M_data >> 4, 4578 else if constexpr (_Np <= 16) 4579 _mm512_mask_cvtepi32_storeu_epi8( 4580 __mem, 0xffffu >> (16 - _Np), 4581 
_mm512_maskz_set1_epi32(__v._M_data, 1)); 4583 __assert_unreachable<_Tp>(); 4585 else if constexpr (__is_sse_abi<_Abi>()) //{{{ 4587 if constexpr (_Np == 2 && sizeof(_Tp) == 8) 4589 const auto __k = __vector_bitcast<int>(__v); 4593 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4) 4595 if constexpr (__have_sse2) 4597 const unsigned __bool4 4598 = __vector_bitcast<_UInt>(_mm_packs_epi16( 4599 _mm_packs_epi32(__intrin_bitcast<__m128i>( 4604 __builtin_memcpy(__mem, &__bool4, _Np); 4606 else if constexpr (__have_mmx) 4608 const __m64 __k = _mm_cvtps_pi8( 4609 __and(__to_intrin(__v), _mm_set1_ps(1.f))); 4610 __builtin_memcpy(__mem, &__k, _Np); 4614 return _Base::_S_store(__v, __mem); 4616 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2) 4618 _CommonImplX86::_S_store<_Np>( 4619 __vector_bitcast<char>(_mm_packs_epi16( 4620 __to_intrin(__vector_bitcast<_UShort>(__v) >> 15), 4624 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1) 4625 _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem); 4627 __assert_unreachable<_Tp>(); 4629 else if constexpr (__is_avx_abi<_Abi>()) // {{{ 4631 if constexpr (_Np <= 4 && sizeof(_Tp) == 8) 4633 auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); 4635 if constexpr (__have_avx2) 4636 __bool4 = _mm256_movemask_epi8(__k); 4638 __bool4 = (_mm_movemask_epi8(__lo128(__k)) 4639 | (_mm_movemask_epi8(__hi128(__k)) << 16)); 4640 __bool4 &= 0x01010101; 4641 __builtin_memcpy(__mem, &__bool4, _Np); 4643 else if constexpr (_Np <= 8 && sizeof(_Tp) == 4) 4645 const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); 4647 = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)), 4650 = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i())); 4651 _CommonImplX86::_S_store<_Np>(__k3, __mem); 4653 else if constexpr (_Np <= 16 && sizeof(_Tp) == 2) 4655 if constexpr (__have_avx2) 4657 const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15); 4658 const auto __bools = __vector_bitcast<char>( 4659 _mm_packs_epi16(__lo128(__x), __hi128(__x))); 4660 
_CommonImplX86::_S_store<_Np>(__bools, __mem); 4666 & __vector_bitcast<_UChar>( 4667 _mm_packs_epi16(__lo128(__to_intrin(__v)), 4668 __hi128(__to_intrin(__v)))); 4669 _CommonImplX86::_S_store<_Np>(__bools, __mem); 4672 else if constexpr (_Np <= 32 && sizeof(_Tp) == 1) 4673 _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem); 4675 __assert_unreachable<_Tp>(); 4678 __assert_unreachable<_Tp>(); 4681 // _S_masked_store {{{2 4682 template <typename _Tp, size_t _Np> 4684 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem, 4685 const _SimdWrapper<_Tp, _Np> __k) noexcept 4687 if constexpr (__is_avx512_abi<_Abi>()) 4689 static_assert(is_same_v<_Tp, bool>); 4690 if constexpr (_Np <= 16 && __have_avx512bw_vl) 4691 _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1)); 4692 else if constexpr (_Np <= 16) 4693 _mm512_mask_cvtepi32_storeu_epi8(__mem, __k, 4694 _mm512_maskz_set1_epi32(__v, 1)); 4695 else if constexpr (_Np <= 32 && __have_avx512bw_vl) 4696 _mm256_mask_storeu_epi8(__mem, __k, 4697 _mm256_maskz_set1_epi8(__v, 1)); 4698 else if constexpr (_Np <= 32 && __have_avx512bw) 4699 _mm256_mask_storeu_epi8(__mem, __k, 4700 __lo256(_mm512_maskz_set1_epi8(__v, 1))); 4701 else if constexpr (_Np <= 64 && __have_avx512bw) 4702 _mm512_mask_storeu_epi8(__mem, __k, 4703 _mm512_maskz_set1_epi8(__v, 1)); 4705 __assert_unreachable<_Tp>(); 4708 _Base::_S_masked_store(__v, __mem, __k); 4711 // logical and bitwise operators {{{2 4712 template <typename _Tp, size_t _Np> 4713 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4714 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, 4715 const _SimdWrapper<_Tp, _Np>& __y) 4717 if constexpr (is_same_v<_Tp, bool>) 4719 if constexpr (__have_avx512dq && _Np <= 8) 4720 return _kand_mask8(__x._M_data, __y._M_data); 4721 else if constexpr (_Np <= 16) 4722 return _kand_mask16(__x._M_data, __y._M_data); 4723 else if constexpr (__have_avx512bw && _Np <= 32) 4724 return _kand_mask32(__x._M_data, __y._M_data); 4725 else if 
constexpr (__have_avx512bw && _Np <= 64) 4726 return _kand_mask64(__x._M_data, __y._M_data); 4728 __assert_unreachable<_Tp>(); 4731 return _Base::_S_logical_and(__x, __y); 4734 template <typename _Tp, size_t _Np> 4735 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4736 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, 4737 const _SimdWrapper<_Tp, _Np>& __y) 4739 if constexpr (is_same_v<_Tp, bool>) 4741 if constexpr (__have_avx512dq && _Np <= 8) 4742 return _kor_mask8(__x._M_data, __y._M_data); 4743 else if constexpr (_Np <= 16) 4744 return _kor_mask16(__x._M_data, __y._M_data); 4745 else if constexpr (__have_avx512bw && _Np <= 32) 4746 return _kor_mask32(__x._M_data, __y._M_data); 4747 else if constexpr (__have_avx512bw && _Np <= 64) 4748 return _kor_mask64(__x._M_data, __y._M_data); 4750 __assert_unreachable<_Tp>(); 4753 return _Base::_S_logical_or(__x, __y); 4756 template <typename _Tp, size_t _Np> 4757 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4758 _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x) 4760 if constexpr (is_same_v<_Tp, bool>) 4762 if constexpr (__have_avx512dq && _Np <= 8) 4763 return _kandn_mask8(__x._M_data, 4764 _Abi::template __implicit_mask_n<_Np>()); 4765 else if constexpr (_Np <= 16) 4766 return _kandn_mask16(__x._M_data, 4767 _Abi::template __implicit_mask_n<_Np>()); 4768 else if constexpr (__have_avx512bw && _Np <= 32) 4769 return _kandn_mask32(__x._M_data, 4770 _Abi::template __implicit_mask_n<_Np>()); 4771 else if constexpr (__have_avx512bw && _Np <= 64) 4772 return _kandn_mask64(__x._M_data, 4773 _Abi::template __implicit_mask_n<_Np>()); 4775 __assert_unreachable<_Tp>(); 4778 return _Base::_S_bit_not(__x); 4781 template <typename _Tp, size_t _Np> 4782 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4783 _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, 4784 const _SimdWrapper<_Tp, _Np>& __y) 4786 if constexpr (is_same_v<_Tp, bool>) 4788 if constexpr (__have_avx512dq && _Np <= 8) 4789 return 
_kand_mask8(__x._M_data, __y._M_data); 4790 else if constexpr (_Np <= 16) 4791 return _kand_mask16(__x._M_data, __y._M_data); 4792 else if constexpr (__have_avx512bw && _Np <= 32) 4793 return _kand_mask32(__x._M_data, __y._M_data); 4794 else if constexpr (__have_avx512bw && _Np <= 64) 4795 return _kand_mask64(__x._M_data, __y._M_data); 4797 __assert_unreachable<_Tp>(); 4800 return _Base::_S_bit_and(__x, __y); 4803 template <typename _Tp, size_t _Np> 4804 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4805 _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, 4806 const _SimdWrapper<_Tp, _Np>& __y) 4808 if constexpr (is_same_v<_Tp, bool>) 4810 if constexpr (__have_avx512dq && _Np <= 8) 4811 return _kor_mask8(__x._M_data, __y._M_data); 4812 else if constexpr (_Np <= 16) 4813 return _kor_mask16(__x._M_data, __y._M_data); 4814 else if constexpr (__have_avx512bw && _Np <= 32) 4815 return _kor_mask32(__x._M_data, __y._M_data); 4816 else if constexpr (__have_avx512bw && _Np <= 64) 4817 return _kor_mask64(__x._M_data, __y._M_data); 4819 __assert_unreachable<_Tp>(); 4822 return _Base::_S_bit_or(__x, __y); 4825 template <typename _Tp, size_t _Np> 4826 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4827 _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, 4828 const _SimdWrapper<_Tp, _Np>& __y) 4830 if constexpr (is_same_v<_Tp, bool>) 4832 if constexpr (__have_avx512dq && _Np <= 8) 4833 return _kxor_mask8(__x._M_data, __y._M_data); 4834 else if constexpr (_Np <= 16) 4835 return _kxor_mask16(__x._M_data, __y._M_data); 4836 else if constexpr (__have_avx512bw && _Np <= 32) 4837 return _kxor_mask32(__x._M_data, __y._M_data); 4838 else if constexpr (__have_avx512bw && _Np <= 64) 4839 return _kxor_mask64(__x._M_data, __y._M_data); 4841 __assert_unreachable<_Tp>(); 4844 return _Base::_S_bit_xor(__x, __y); 4848 // _S_masked_assign{{{ 4849 template <size_t _Np> 4850 _GLIBCXX_SIMD_INTRINSIC static void 4851 _S_masked_assign(_SimdWrapper<bool, _Np> __k, 4852 
_SimdWrapper<bool, _Np>& __lhs, 4853 _SimdWrapper<bool, _Np> __rhs) 4856 = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data); 4859 template <size_t _Np> 4860 _GLIBCXX_SIMD_INTRINSIC static void 4861 _S_masked_assign(_SimdWrapper<bool, _Np> __k, 4862 _SimdWrapper<bool, _Np>& __lhs, bool __rhs) 4865 __lhs._M_data = __k._M_data | __lhs._M_data; 4867 __lhs._M_data = ~__k._M_data & __lhs._M_data; 4870 using _MaskImplBuiltin<_Abi>::_S_masked_assign; 4874 template <typename _Tp> 4875 _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k) 4877 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 4879 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 4880 using _TI = __intrinsic_type_t<_Tp, _Np>; 4881 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 4882 if constexpr (__have_sse4_1) 4884 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 4885 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 4886 return 0 != __testc(__a, __b); 4888 else if constexpr (is_same_v<_Tp, float>) 4889 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) 4891 else if constexpr (is_same_v<_Tp, double>) 4892 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) 4895 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1)) 4896 == (1 << (_Np * sizeof(_Tp))) - 1; 4898 else if constexpr (__is_avx512_abi<_Abi>()) 4900 constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>(); 4901 const auto __kk = __k._M_data._M_data; 4902 if constexpr (sizeof(__kk) == 1) 4904 if constexpr (__have_avx512dq) 4905 return _kortestc_mask8_u8(__kk, _Mask == 0xff 4907 : __mmask8(~_Mask)); 4909 return _kortestc_mask16_u8(__kk, __mmask16(~_Mask)); 4911 else if constexpr (sizeof(__kk) == 2) 4912 return _kortestc_mask16_u8(__kk, _Mask == 0xffff 4914 : __mmask16(~_Mask)); 4915 else if constexpr (sizeof(__kk) == 4 && __have_avx512bw) 4916 return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU 4918 : __mmask32(~_Mask)); 4919 else if constexpr (sizeof(__kk) == 8 && __have_avx512bw) 4920 return 
_kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL 4922 : __mmask64(~_Mask)); 4924 __assert_unreachable<_Tp>(); 4930 template <typename _Tp> 4931 _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k) 4933 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 4935 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 4936 using _TI = __intrinsic_type_t<_Tp, _Np>; 4937 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 4938 if constexpr (__have_sse4_1) 4940 if constexpr (_Abi::template _S_is_partial< 4941 _Tp> || sizeof(__k) < 16) 4943 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 4944 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 4945 return 0 == __testz(__a, __b); 4948 return 0 == __testz(__a, __a); 4950 else if constexpr (is_same_v<_Tp, float>) 4951 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0; 4952 else if constexpr (is_same_v<_Tp, double>) 4953 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0; 4955 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1)) 4958 else if constexpr (__is_avx512_abi<_Abi>()) 4959 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>()) 4965 template <typename _Tp> 4966 _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k) 4968 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 4970 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 4971 using _TI = __intrinsic_type_t<_Tp, _Np>; 4972 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 4973 if constexpr (__have_sse4_1) 4975 if constexpr (_Abi::template _S_is_partial< 4976 _Tp> || sizeof(__k) < 16) 4978 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 4979 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 4980 return 0 != __testz(__a, __b); 4983 return 0 != __testz(__a, __a); 4985 else if constexpr (is_same_v<_Tp, float>) 4986 return (__movemask(__a) & ((1 << _Np) - 1)) == 0; 4987 else if constexpr (is_same_v<_Tp, double>) 4988 return (__movemask(__a) & ((1 << _Np) - 1)) == 0; 4990 return 
(__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1)) 4993 else if constexpr (__is_avx512_abi<_Abi>()) 4994 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>()) 5000 template <typename _Tp> 5001 _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k) 5003 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 5005 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5006 using _TI = __intrinsic_type_t<_Tp, _Np>; 5007 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 5008 if constexpr (__have_sse4_1) 5010 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 5011 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 5012 return 0 != __testnzc(__a, __b); 5014 else if constexpr (is_same_v<_Tp, float>) 5016 constexpr int __allbits = (1 << _Np) - 1; 5017 const auto __tmp = _mm_movemask_ps(__a) & __allbits; 5018 return __tmp > 0 && __tmp < __allbits; 5020 else if constexpr (is_same_v<_Tp, double>) 5022 constexpr int __allbits = (1 << _Np) - 1; 5023 const auto __tmp = _mm_movemask_pd(__a) & __allbits; 5024 return __tmp > 0 && __tmp < __allbits; 5028 constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1; 5029 const auto __tmp = _mm_movemask_epi8(__a) & __allbits; 5030 return __tmp > 0 && __tmp < __allbits; 5033 else if constexpr (__is_avx512_abi<_Abi>()) 5034 return _S_any_of(__k) && !_S_all_of(__k); 5036 __assert_unreachable<_Tp>(); 5041 template <typename _Tp> 5042 _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k) 5044 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5045 const auto __kk = _Abi::_S_masked(__k._M_data)._M_data; 5046 if constexpr (__is_avx512_abi<_Abi>()) 5048 if constexpr (_Np > 32) 5049 return __builtin_popcountll(__kk); 5051 return __builtin_popcount(__kk); 5055 if constexpr (__have_popcnt) 5058 = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk))); 5059 const int __count = __builtin_popcount(__bits); 5060 return is_integral_v<_Tp> ? 
__count / sizeof(_Tp) : __count; 5062 else if constexpr (_Np == 2 && sizeof(_Tp) == 8) 5064 const int mask = _mm_movemask_pd(__auto_bitcast(__kk)); 5065 return mask - (mask >> 1); 5067 else if constexpr (_Np <= 4 && sizeof(_Tp) == 8) 5069 auto __x = -(__lo128(__kk) + __hi128(__kk)); 5070 return __x[0] + __x[1]; 5072 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4) 5074 if constexpr (__have_sse2) 5076 __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk)); 5077 __x = _mm_add_epi32( 5078 __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3))); 5079 __x = _mm_add_epi32( 5080 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2))); 5081 return -_mm_cvtsi128_si32(__x); 5084 return __builtin_popcount( 5085 _mm_movemask_ps(__auto_bitcast(__kk))); 5087 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2) 5089 auto __x = __to_intrin(__kk); 5090 __x = _mm_add_epi16(__x, 5091 _mm_shuffle_epi32(__x, 5092 _MM_SHUFFLE(0, 1, 2, 3))); 5093 __x = _mm_add_epi16( 5094 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3))); 5095 __x = _mm_add_epi16( 5096 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1))); 5097 return -short(_mm_extract_epi16(__x, 0)); 5099 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1) 5101 auto __x = __to_intrin(__kk); 5102 __x = _mm_add_epi8(__x, 5103 _mm_shuffle_epi32(__x, 5104 _MM_SHUFFLE(0, 1, 2, 3))); 5105 __x = _mm_add_epi8(__x, 5106 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 5108 __x = _mm_add_epi8(__x, 5109 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 5111 auto __y = -__vector_bitcast<_UChar>(__x); 5112 if constexpr (__have_sse4_1) 5113 return __y[0] + __y[1]; 5116 unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0); 5117 return (__z & 0xff) + (__z >> 8); 5120 else if constexpr (sizeof(__kk) == 32) 5122 // The following works only as long as the implementations above 5124 using _I = __int_for_sizeof_t<_Tp>; 5125 const auto __as_int = __vector_bitcast<_I>(__kk); 5126 _MaskImplX86<simd_abi::__sse>::_S_popcount( 5127 simd_mask<_I, 
simd_abi::__sse>(__private_init, 5129 + __hi128(__as_int))); 5132 __assert_unreachable<_Tp>(); 5137 // _S_find_first_set {{{ 5138 template <typename _Tp> 5139 _GLIBCXX_SIMD_INTRINSIC static int 5140 _S_find_first_set(simd_mask<_Tp, _Abi> __k) 5142 if constexpr (__is_avx512_abi<_Abi>()) 5143 return std::__countr_zero(__k._M_data._M_data); 5145 return _Base::_S_find_first_set(__k); 5149 // _S_find_last_set {{{ 5150 template <typename _Tp> 5151 _GLIBCXX_SIMD_INTRINSIC static int 5152 _S_find_last_set(simd_mask<_Tp, _Abi> __k) 5154 if constexpr (__is_avx512_abi<_Abi>()) 5155 return std::__bit_width(__k._M_data._M_data) - 1; 5157 return _Base::_S_find_last_set(__k); 5165 _GLIBCXX_SIMD_END_NAMESPACE 5166 #endif // __cplusplus >= 201703L 5167 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ 5169 // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80 typename conditional< _Cond, _Iftrue, _Iffalse >::type conditional_t
Alias template for conditional.
constexpr const _Tp & min(const _Tp &, const _Tp &)
This does what you think it does.