19#elif defined(__AVX2__)
32template<
typename T, std::
size_t Align>
41 static constexpr size_t alignment =
ALIGN;
51 __cpuid_count(7, 0, regs[0], regs[1], regs[2], regs[3]);
52 return (regs[1] & (1 << 16));
57 __cpuid_count(7, 0, regs[0], regs[1], regs[2], regs[3]);
58 return (regs[1] & (1 << 5));
63 __cpuid_count(1, 0, regs[0], regs[1], regs[2], regs[3]);
64 return (regs[3] & (1 << 25));
74 std::cout <<
"[dispatch] Detected AVX512\n";
77 std::cout <<
"[dispatch] Detected AVX2\n";
80 std::cout <<
"[dispatch] Detected SSE\n";
83 throw std::runtime_error(
"No supported SIMD ISA (SSE/AVX2/AVX512).");
88 alignas(64)
float tmp[16];
89 _mm512_store_ps(tmp, v);
91 return _mm256_load_ps(&tmp[0]);
93 return _mm256_load_ps(&tmp[8]);
/// Horizontally adds the eight float lanes of a 256-bit register.
/// The two 128-bit halves are added lane-wise first; two horizontal adds then
/// fold the four partial sums into lane 0, which is returned.
__attribute__((always_inline, hot, flatten))
inline float reduce_sum(__m256 acc) {
    const __m128 halves = _mm_add_ps(_mm256_castps256_ps128(acc),
                                     _mm256_extractf128_ps(acc, 1));
    const __m128 pairs = _mm_hadd_ps(halves, halves);
    return _mm_cvtss_f32(_mm_hadd_ps(pairs, pairs));
}
111 __attribute__((always_inline, hot, flatten))
112 inline double reduce_sum(__m256d acc) {
113 __m128d low = _mm256_castpd256_pd128(acc);
114 __m128d high = _mm256_extractf128_pd(acc, 1);
115 __m128d sum = _mm_add_pd(low, high);
117 _mm_store_pd(r, sum);
121 inline uint64_t reduce_sum(__m256i acc) {
122 __m128i low = _mm256_castsi256_si128(acc);
123 __m128i high = _mm256_extractf128_si256(acc, 1);
124 __m128i sum = _mm_add_epi64(low, high);
126 _mm_store_si128(
reinterpret_cast<__m128i*
>(r), sum);
130 inline float reduce_sum(__m512 acc) {
131 __m256 low = _mm512_castps512_ps256(acc);
132#if defined(__AVX512DQ__)
133 __m256 high = _mm512_extractf32x8_ps(acc, 1);
137 __m256 sum = _mm256_add_ps(low, high);
138 return reduce_sum(sum);
141 inline double reduce_sum(__m512d acc) {
142 __m256d low = _mm512_castpd512_pd256(acc);
143 __m256d high = _mm512_extractf64x4_pd(acc, 1);
144 __m256d sum = _mm256_add_pd(low, high);
145 return reduce_sum(sum);
148 inline uint64_t reduce_sum(__m512i acc) {
149 __m256i low = _mm512_castsi512_si256(acc);
150 __m256i high = _mm512_extracti64x4_epi64(acc, 1);
151 __m256i sum = _mm256_add_epi64(low, high);
152 return reduce_sum(sum);
/// Sums the four float lanes of a 128-bit register with two horizontal adds
/// and returns the total from lane 0.
inline float reduce_sum(__m128 acc) {
    const __m128 pairs = _mm_hadd_ps(acc, acc);
    return _mm_cvtss_f32(_mm_hadd_ps(pairs, pairs));
}
/// Computes (~a) & b on the raw bits of two 512-bit float registers using
/// integer operations only (XOR with all-ones, then AND).
static inline __m512 andnot_fallback(__m512 a, __m512 b) {
    const __m512i all_ones = _mm512_set1_epi32(-1);
    const __m512i inverted_a = _mm512_xor_si512(_mm512_castps_si512(a), all_ones);
    return _mm512_castsi512_ps(_mm512_and_si512(inverted_a, _mm512_castps_si512(b)));
}
176 template<
typename T,
typename ISA = DefaultISA>
181 static constexpr size_t width = 4;
182 static constexpr size_t alignment = 16;
184 static inline reg set1(
float x) {
return _mm_set1_ps(x); }
185 static inline reg set(
float a,
float b,
float c,
float d) {
186 return _mm_set_ps(a, b, c, d);
188 template<
int i0,
int i1,
int i2,
int i3>
190 return _mm_permute_ps(x, _MM_SHUFFLE(i3, i2, i1, i0));
193 float a0,
float a1,
float a2,
float a3
195 return _mm_set_ps(a3, a2, a1, a0);
198 alignas(16)
float tmp[4];
199 _mm_store_ps(tmp, value);
200 alignas(16)
int m[4];
201 _mm_store_si128(
reinterpret_cast<__m128i*
>(m), _mm_castps_si128(mask));
202 for (
int i = 0; i < 4; ++i)
208 alignas(16)
float values[4];
209 _mm_storeu_ps(values, x);
210 return values[index];
212 static inline void stream(
float* ptr,
reg x) { _mm_stream_ps(ptr, x); }
216 alignas(16)
float values[4];
217 _mm_store_ps(values, v);
218 return values[0] + values[1] + values[2] + values[3];
220 static inline reg load(
const float* ptr) {
return _mm_load_ps(ptr); }
221 static inline reg loadu(
const float* ptr) {
return _mm_loadu_ps(ptr); }
222 static inline void store(
float* ptr,
reg x) { _mm_store_ps(ptr, x); }
223 static inline void storeu(
float* ptr,
reg x) { _mm_storeu_ps(ptr, x); }
224 static inline reg zero() {
return _mm_setzero_ps(); }
238 static constexpr size_t alignment = 16;
240 static constexpr size_t width = 2;
241 static inline reg set1(
double x) {
return _mm_set1_pd(x); }
242 static inline reg set(
double a,
double b) {
return _mm_set_pd(b, a); }
244 alignas(16)
double values[2];
245 _mm_storeu_pd(values, x);
246 return values[index];
248 static inline void stream(
double* ptr,
reg x) { _mm_stream_pd(ptr, x); }
251 alignas(16)
double values[2];
252 _mm_store_pd(values, v);
253 return values[0] + values[1];
255 static inline void maskstore(
double* ptr, __m128i mask, __m128d value) {
256 alignas(16)
double tmp[2];
257 _mm_store_pd(tmp, value);
258 alignas(16)
int m[2];
259 _mm_store_si128(
reinterpret_cast<__m128i*
>(m), mask);
260 for (
int i = 0; i < 2; ++i)
265 static inline reg load(
const double* ptr) {
return _mm_load_pd(ptr); }
266 static inline reg loadu(
const double* ptr) {
return _mm_loadu_pd(ptr); }
267 static inline void store(
double* ptr,
reg x) { _mm_store_pd(ptr, x); }
268 static inline void storeu(
double* ptr,
reg x) { _mm_storeu_pd(ptr, x); }
269 static inline reg zero() {
return _mm_setzero_pd(); }
283 static constexpr size_t width = 2;
284 static constexpr size_t alignment = 16;
286 static inline reg set1(uint64_t x) {
return _mm_set1_epi64x(x); }
287 static inline reg set(uint64_t a, uint64_t b) {
return _mm_set_epi64x(b, a); }
289 alignas(16) uint64_t values[2];
290 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(values), x);
291 return values[index];
293 static inline void stream(uint64_t* ptr,
reg x) { _mm_stream_si128(
reinterpret_cast<__m128i*
>(ptr), x); }
296 alignas(16) uint64_t values[2];
297 _mm_store_si128(
reinterpret_cast<__m128i*
>(values), v);
298 return values[0] + values[1];
300 static inline reg load(
const uint64_t* ptr) {
return _mm_load_si128(
reinterpret_cast<const __m128i*
>(ptr)); }
301 static inline reg loadu(
const uint64_t* ptr) {
return _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(ptr)); }
302 static inline void store(uint64_t* ptr,
reg x) { _mm_store_si128(
reinterpret_cast<__m128i*
>(ptr), x); }
303 static inline void storeu(uint64_t* ptr,
reg x) { _mm_storeu_si128(
reinterpret_cast<__m128i*
>(ptr), x); }
304 static inline reg zero() {
return _mm_setzero_si128(); }
307 alignas(16)
size_t lhs[2], rhs[2], out[2];
308 _mm_store_si128((__m128i*)lhs, a);
309 _mm_store_si128((__m128i*)rhs, b);
310 for (
size_t i = 0; i < 2; ++i)
311 out[i] = lhs[i] * rhs[i];
312 return _mm_load_si128((__m128i*)out);
317 static inline void store_stream(uint64_t* ptr,
reg x) { _mm_stream_si128(
reinterpret_cast<__m128i*
>(ptr), x); }
318 static inline reg set_epi64(int64_t a, int64_t b) {
return _mm_set_epi64x(b, a); }
326 static constexpr size_t width = 8;
327 static constexpr size_t alignment = 32;
329 static inline reg set1(
float x) {
return _mm256_set1_ps(x); }
330 static inline reg set(
float a,
float b,
float c,
float d) {
331 return _mm256_set_ps(a, b, c, d, a, b, c, d);
333 static inline void maskstore(
float* ptr, __m256i mask, __m256 value) {
334 _mm256_maskstore_ps(ptr, mask, value);
336 template<
int i0,
int i1,
int i2,
int i3>
338 constexpr int imm = _MM_SHUFFLE(i3, i2, i1, i0);
339 return _mm256_permute_ps(x, imm);
342 float a0,
float a1,
float a2,
float a3,
343 float a4,
float a5,
float a6,
float a7
345 return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0);
348 alignas(32)
float values[8];
349 _mm256_storeu_ps(values, x);
350 return values[index];
353 return _mm256_maskload_ps(ptr, m);
356 return _mm256_broadcast_ss(ptr);
358 static inline void stream(
float* ptr,
reg x) { _mm256_stream_ps(ptr, x); }
362 static inline reg load(
const float* ptr) {
return _mm256_load_ps(ptr); }
363 static inline reg loadu(
const float* ptr) {
return _mm256_loadu_ps(ptr); }
364 static inline void store(
float* ptr,
reg x) { _mm256_store_ps(ptr, x); }
365 static inline void storeu(
float* ptr,
reg x) { _mm256_storeu_ps(ptr, x); }
366 static inline reg zero() {
return _mm256_setzero_ps(); }
381 static constexpr size_t width = 4;
382 static constexpr size_t alignment = 32;
384 static inline reg set1(
double x) {
return _mm256_set1_pd(x); }
385 static inline reg set(
double a,
double b,
double c,
double d) {
386 return _mm256_set_pd(a, b, c, d);
389 alignas(32)
double values[4];
390 _mm256_storeu_pd(values, x);
391 return values[index];
393 static inline void maskstore(
double* ptr, __m256i mask, __m256d value) {
394 _mm256_maskstore_pd(ptr, mask, value);
397 return _mm256_maskload_pd(ptr, m);
400 return _mm256_broadcast_sd(ptr);
402 static inline void stream(
double* ptr,
reg x) { _mm256_stream_pd(ptr, x); }
405 static inline reg load(
const double* ptr) {
return _mm256_load_pd(ptr); }
406 static inline reg loadu(
const double* ptr) {
return _mm256_loadu_pd(ptr); }
407 static inline void store(
double* ptr,
reg x){ _mm256_store_pd(ptr, x); }
408 static inline void storeu(
double* ptr,
reg x){ _mm256_storeu_pd(ptr, x); }
409 static inline reg zero() {
return _mm256_setzero_pd(); }
422 static constexpr size_t width = 4;
423 static constexpr size_t alignment = 32;
425 static inline reg set1(uint64_t x) {
return _mm256_set1_epi64x(x); }
426 static inline reg set(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
427 return _mm256_set_epi64x(a, b, c, d);
430 alignas(32) uint64_t values[4];
431 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(values), x);
432 return values[index];
434 static inline void stream(uint64_t* ptr,
reg x) { _mm256_stream_si256(
reinterpret_cast<__m256i*
>(ptr), x); }
435 static inline reg setzero() {
return _mm256_setzero_si256(); }
439 static_assert(
sizeof(size_t) ==
sizeof(uint64_t),
440 "SIMD::load(size_t*) requires 64-bit size_t");
441 return _mm256_load_si256(
reinterpret_cast<const __m256i*
>(ptr));
443 static inline reg loadu(
const uint64_t* ptr) {
return _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(ptr)); }
444 static inline void store(uint64_t* ptr,
reg x) { _mm256_store_si256(
reinterpret_cast<__m256i*
>(ptr), x); }
445 static inline void storeu(uint64_t* ptr,
reg x) { _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(ptr), x); }
447 alignas(32)
size_t lhs[4], rhs[4], out[4];
448 _mm256_store_si256((__m256i*)lhs, a);
449 _mm256_store_si256((__m256i*)rhs, b);
450 for (
size_t i = 0; i < 4; ++i)
451 out[i] = lhs[i] * rhs[i];
452 return _mm256_load_si256((__m256i*)out);
454 static inline reg zero() {
return _mm256_setzero_si256(); }
459 static inline void store_stream(uint64_t* ptr,
reg x) { _mm256_stream_si256(
reinterpret_cast<__m256i*
>(ptr), x); }
460 static inline reg set_epi64(int64_t a, int64_t b, int64_t c, int64_t d) {
461 return _mm256_set_epi64x(a, b, c, d);
468 struct SimdTraits<float,
avx512_t> {
470 static constexpr size_t width = 16;
471 static constexpr size_t alignment = 64;
473 static inline reg set1(
float x) {
return _mm512_set1_ps(x); }
474 static inline reg set(
float a,
float b,
float c,
float d,
475 float e,
float f,
float g,
float h,
476 float i,
float j,
float k,
float l,
477 float m,
float n,
float o,
float p) {
478 return _mm512_set_ps(a, b, c, d, e, f, g, h,
479 i, j, k, l, m, n, o, p);
481 static inline float extract(reg x,
size_t index) {
482 alignas(64)
float values[16];
483 _mm512_storeu_ps(values, x);
484 return values[index];
486 static inline void stream(
float* ptr, reg x) { _mm512_stream_ps(ptr, x); }
487 static inline reg setzero() {
return _mm512_setzero_ps(); }
488 static inline float horizontal_add(reg v) {
return detail::reduce_sum(v); }
489 static inline reg load(
const float* ptr) {
return _mm512_load_ps(ptr); }
490 static inline reg loadu(
const float* ptr) {
return _mm512_loadu_ps(ptr); }
491 static inline void store(
float* ptr, reg x) { _mm512_store_ps(ptr, x); }
492 static inline reg loadu_stream(
const float* ptr) {
return _mm512_loadu_ps(ptr); }
493 static inline reg zero() {
return _mm512_setzero_ps(); }
494 static inline reg fmadd(reg a, reg b, reg c){
return _mm512_fmadd_ps(a, b, c); }
495 static inline reg add(reg a, reg b) {
return _mm512_add_ps(a, b); }
496 static inline reg mul(reg a, reg b) {
return _mm512_mul_ps(a, b); }
497 static inline reg sub(reg a, reg b) {
return _mm512_sub_ps(a, b); }
498#if defined(__AVX512DQ__)
499 static inline reg andnot(reg a, reg b) {
return _mm512_andnot_ps(a, b); }
501 static inline reg andnot(reg a, reg b) {
return andnot_fallback(a, b); }
503 static inline void store_stream(
float* ptr, reg x) { _mm512_stream_ps(ptr, x); }
504 static inline reg max(reg a, reg b) {
return _mm512_max_ps(a, b); }
508 struct SimdTraits<double,
avx512_t> {
510 static constexpr size_t width = 8;
511 static constexpr size_t alignment = 64;
513 static inline reg set1(
double x) {
return _mm512_set1_pd(x); }
514 static inline reg set(
double a,
double b,
double c,
double d,
515 double e,
double f,
double g,
double h) {
516 return _mm512_set_pd(a, b, c, d, e, f, g, h);
518 static inline double extract(reg x,
size_t index) {
519 alignas(64)
double values[8];
520 _mm512_storeu_pd(values, x);
521 return values[index];
523 static inline void stream(
double* ptr, reg x) { _mm512_stream_pd(ptr, x); }
524 static inline reg setzero() {
return _mm512_setzero_pd(); }
525 static inline float horizontal_add(reg v) {
return detail::reduce_sum(v); }
526 static inline reg load(
const double* ptr) {
return _mm512_load_pd(ptr); }
527 static inline reg loadu(
const double* ptr) {
return _mm512_loadu_pd(ptr); }
528 static inline void store(
double* ptr, reg x){ _mm512_store_pd(ptr, x); }
529 static inline reg loadu_stream(
const double* ptr) {
return _mm512_loadu_pd(ptr); }
530 static inline reg zero() {
return _mm512_setzero_pd(); }
531 static inline reg fmadd(reg a, reg b, reg c){
return _mm512_fmadd_pd(a, b, c); }
532 static inline reg add(reg a, reg b) {
return _mm512_add_pd(a, b); }
533 static inline reg mul(reg a, reg b) {
return _mm512_mul_pd(a, b); }
534 static inline reg sub(reg a, reg b) {
return _mm512_sub_pd(a, b); }
535 static inline reg andnot(reg a, reg b) {
return _mm512_andnot_pd(a, b); }
536 static inline void store_stream(
double* ptr, reg x) { _mm512_stream_pd(ptr, x); }
537 static inline reg max(reg a, reg b) {
return _mm512_max_pd(a, b); }
541 struct SimdTraits<size_t,
avx512_t> {
543 static constexpr size_t width = 8;
544 static constexpr size_t alignment = 64;
546 static inline reg set1(
size_t x) {
return _mm512_set1_epi64(x); }
547 static inline reg set(
size_t a,
size_t b,
size_t c,
size_t d,
548 size_t e,
size_t f,
size_t g,
size_t h) {
549 return _mm512_set_epi64(a, b, c, d, e, f, g, h);
551 static inline size_t extract(reg x,
size_t index) {
552 alignas(64)
size_t values[8];
553 _mm512_storeu_si512(
reinterpret_cast<__m512i*
>(values), x);
554 return values[index];
556 static inline reg setzero() {
return _mm512_setzero_si512(); }
557 static inline float horizontal_add(reg v) {
return detail::reduce_sum(v); }
558 static inline reg load(
const size_t* ptr) {
return _mm512_load_si512(
reinterpret_cast<const __m512i*
>(ptr)); }
559 static inline reg loadu(
const size_t* ptr) {
return _mm512_loadu_si512(
reinterpret_cast<const __m512i*
>(ptr)); }
560 static inline void store(
size_t* ptr, reg x) { _mm512_store_si512(
reinterpret_cast<__m512i*
>(ptr), x); }
561 static inline void storeu(
size_t* ptr, reg x) { _mm512_storeu_si512(
reinterpret_cast<__m512i*
>(ptr), x); }
562 static inline reg zero() {
return _mm512_setzero_si512(); }
563 static inline reg add(reg a, reg b) {
return _mm512_add_epi64(a, b); }
564#if defined(__AVX512DQ__)
565 static inline reg mul(reg a, reg b) {
566 return _mm512_mullo_epi64(a, b);
569 static inline reg mul(reg a, reg b) {
570 alignas(64) uint64_t A[8], B[8], R[8];
571 _mm512_store_epi64(A, a);
572 _mm512_store_epi64(B, b);
573 for (
int i = 0; i < 8; ++i) R[i] = A[i] * B[i];
574 return _mm512_load_epi64(R);
578 static inline reg fmadd(reg a, reg b, reg c){
return _mm512_add_epi64(mul(a, b), c); }
579 static inline reg sub(reg a, reg b) {
return _mm512_sub_epi64(a, b); }
580 static inline reg andnot(reg a, reg b) {
return _mm512_andnot_si512(a, b); }
581 static inline void store_stream(
size_t* ptr, reg x) { _mm512_stream_si512(
reinterpret_cast<__m512i*
>(ptr), x); }
582 static inline reg set_epi64(int64_t a, int64_t b, int64_t c, int64_t d, int64_t e, int64_t f, int64_t g, int64_t h) {
583 return _mm512_set_epi64(a, b, c, d, e, f, g, h);
585 static inline reg max(reg a, reg b) {
return _mm512_max_epi64(a, b); }
595 static constexpr size_t width = 2;
596 static constexpr size_t alignment = 16;
598 static inline reg set(std::complex<float> a, std::complex<float> b) {
599 return _mm_set_ps(b.imag(), a.real(), b.real(), a.imag());
601 static inline reg set1(std::complex<float> x) {
602 return _mm_set_ps(x.imag(), x.real(), x.imag(), x.real());
604 static inline reg load(
const std::complex<float>* ptr) {
605 return _mm_loadu_ps(
reinterpret_cast<const float*
>(ptr));
607 static inline std::complex<float>
extract(
reg x,
size_t index) {
608 alignas(16)
float values[4];
609 _mm_storeu_ps(values, x);
610 return std::complex<float>(values[2 * index], values[2 * index + 1]);
612 static inline reg loadu(
const std::complex<float>* ptr) {
613 return _mm_loadu_ps(
reinterpret_cast<const float*
>(ptr));
615 static inline void store(std::complex<float>* ptr,
reg x) {
616 _mm_storeu_ps(
reinterpret_cast<float*
>(ptr), x);
618 static inline void storeu(std::complex<float>* ptr,
reg x) {
619 _mm_storeu_ps(
reinterpret_cast<float*
>(ptr), x);
624 __m128 a_real = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,2,0,0));
625 __m128 a_imag = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3,3,1,1));
626 __m128 b_real = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2,2,0,0));
627 __m128 b_imag = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3,3,1,1));
629 __m128
real = _mm_sub_ps(_mm_mul_ps(a_real, b_real), _mm_mul_ps(a_imag, b_imag));
630 __m128 imag = _mm_add_ps(_mm_mul_ps(a_real, b_imag), _mm_mul_ps(a_imag, b_real));
632 return _mm_unpacklo_ps(
real, imag);
640 alignas(16)
float values[4];
641 _mm_storeu_ps(values, v);
642 return std::complex<float>(values[0] + values[2], values[1] + values[3]);
644 static inline void stream(std::complex<float>* ptr,
reg x) {
645 _mm_stream_ps(
reinterpret_cast<float*
>(ptr), x);
648 _mm_stream_ps(
reinterpret_cast<float*
>(ptr), x);
655 static constexpr size_t width = 1;
656 static constexpr size_t alignment = 16;
658 static inline reg set(std::complex<double> a, std::complex<double> b) {
659 return _mm_set_pd(b.imag(), a.real());
661 static inline reg set1(std::complex<double> x) {
662 return _mm_set_pd(x.imag(), x.real());
664 static inline reg load(
const std::complex<double>* ptr) {
665 return _mm_loadu_pd(
reinterpret_cast<const double*
>(ptr));
667 static inline reg loadu(
const std::complex<double>* ptr) {
668 return _mm_loadu_pd(
reinterpret_cast<const double*
>(ptr));
670 static inline std::complex<double>
extract(
reg x,
size_t = 0) {
671 alignas(16)
double values[2];
672 _mm_storeu_pd(values, x);
673 return std::complex<double>(values[0], values[1]);
675 static inline void store(std::complex<double>* ptr,
reg x) {
676 _mm_storeu_pd(
reinterpret_cast<double*
>(ptr), x);
678 static inline void storeu(std::complex<double>* ptr,
reg x) {
679 _mm_storeu_pd(
reinterpret_cast<double*
>(ptr), x);
684 __m128d a_real = _mm_unpacklo_pd(a, a);
685 __m128d a_imag = _mm_unpackhi_pd(a, a);
686 __m128d b_real = _mm_unpacklo_pd(b, b);
687 __m128d b_imag = _mm_unpackhi_pd(b, b);
689 __m128d
real = _mm_sub_pd(_mm_mul_pd(a_real, b_real), _mm_mul_pd(a_imag, b_imag));
690 __m128d imag = _mm_add_pd(_mm_mul_pd(a_real, b_imag), _mm_mul_pd(a_imag, b_real));
692 return _mm_unpacklo_pd(
real, imag);
695 reg result = mul(a, b);
696 return _mm_add_pd(result, c);
// Number of std::complex<float> elements per register: a 256-bit register
// holds eight floats, i.e. four complex values (set() takes four arguments).
static constexpr size_t width = 4;
static constexpr size_t alignment = 32;
710 static inline reg set(std::complex<float> a, std::complex<float> b,
711 std::complex<float> c, std::complex<float> d) {
712 return _mm256_set_ps(
719 static inline reg set1(std::complex<float> x) {
720 return _mm256_set_ps(x.imag(), x.real(), x.imag(), x.real(),
721 x.imag(), x.real(), x.imag(), x.real());
723 static inline reg load(
const std::complex<float>* ptr) {
724 return _mm256_loadu_ps(
reinterpret_cast<const float*
>(ptr));
726 static inline reg loadu(
const std::complex<float>* ptr) {
727 return _mm256_loadu_ps(
reinterpret_cast<const float*
>(ptr));
729 static inline std::complex<float>
extract(
reg x,
size_t index) {
730 alignas(32)
float values[8];
731 _mm256_storeu_ps(values, x);
732 return std::complex<float>(values[2 * index], values[2 * index + 1]);
735 float re = ptr->real();
736 float im = ptr->imag();
737 return _mm256_set_ps(im, re, im, re, im, re, im, re);
740 static inline void store(std::complex<float>* ptr,
reg x) {
741 _mm256_store_ps(
reinterpret_cast<float*
>(ptr), x);
743 static inline void storeu(std::complex<float>* ptr,
reg x) {
744 _mm256_storeu_ps(
reinterpret_cast<float*
>(ptr), x);
746 static inline void stream(std::complex<float>* ptr,
reg x) {
747 _mm256_stream_ps(
reinterpret_cast<float*
>(ptr), x);
752 __m256 a_real = _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2,0,2,0));
753 __m256 a_imag = _mm256_shuffle_ps(a, a, _MM_SHUFFLE(3,1,3,1));
754 __m256 b_real = _mm256_shuffle_ps(b, b, _MM_SHUFFLE(2,0,2,0));
755 __m256 b_imag = _mm256_shuffle_ps(b, b, _MM_SHUFFLE(3,1,3,1));
757 __m256
real = _mm256_sub_ps(_mm256_mul_ps(a_real, b_real), _mm256_mul_ps(a_imag, b_imag));
758 __m256 imag = _mm256_add_ps(_mm256_mul_ps(a_real, b_imag), _mm256_mul_ps(a_imag, b_real));
760 __m256 result = _mm256_unpacklo_ps(
real, imag);
761 __m256 result_high = _mm256_unpackhi_ps(
real, imag);
763 return _mm256_permute2f128_ps(result, result_high, 0x20);
766 reg result = mul(a, b);
767 return _mm256_add_ps(result, c);
773 static inline reg zero() {
return _mm256_setzero_ps(); }
776 alignas(32)
float values[8];
777 _mm256_storeu_ps(values, v);
778 return std::complex<float>(
779 values[0] + values[2] + values[4] + values[6],
780 values[1] + values[3] + values[5] + values[7]
783 static inline reg maskload(
const std::complex<float>* ptr, __m256i mask) {
784 return _mm256_maskload_ps(
reinterpret_cast<const float*
>(ptr), mask);
786 static inline void maskstore(std::complex<float>* ptr, __m256i mask,
reg v) {
787 _mm256_maskstore_ps(
reinterpret_cast<float*
>(ptr), mask, v);
794 static constexpr size_t width = 2;
795 static constexpr size_t alignment = 32;
797 static inline reg set(std::complex<double> a, std::complex<double> b) {
798 return _mm256_set_pd(
803 static inline reg set1(std::complex<double> x) {
804 return _mm256_set_pd(x.imag(), x.real(), x.imag(), x.real());
806 static inline reg load(
const std::complex<double>* ptr) {
807 return _mm256_loadu_pd(
reinterpret_cast<const double*
>(ptr));
809 static inline reg loadu(
const std::complex<double>* ptr) {
810 return _mm256_loadu_pd(
reinterpret_cast<const double*
>(ptr));
812 static inline void store(std::complex<double>* ptr,
reg x) {
813 _mm256_storeu_pd(
reinterpret_cast<double*
>(ptr), x);
815 static inline void storeu(std::complex<double>* ptr,
reg x) {
816 _mm256_storeu_pd(
reinterpret_cast<double*
>(ptr), x);
819 double re = ptr->real();
820 double im = ptr->imag();
821 return _mm256_set_pd(im, re, im, re);
826 __m128d a_lo = _mm256_castpd256_pd128(a);
827 __m128d a_hi = _mm256_extractf128_pd(a, 1);
828 __m128d b_lo = _mm256_castpd256_pd128(b);
829 __m128d b_hi = _mm256_extractf128_pd(b, 1);
831 __m128d a_lo_real = _mm_unpacklo_pd(a_lo, a_lo);
832 __m128d a_lo_imag = _mm_unpackhi_pd(a_lo, a_lo);
833 __m128d b_lo_real = _mm_unpacklo_pd(b_lo, b_lo);
834 __m128d b_lo_imag = _mm_unpackhi_pd(b_lo, b_lo);
836 __m128d real_lo = _mm_sub_pd(_mm_mul_pd(a_lo_real, b_lo_real), _mm_mul_pd(a_lo_imag, b_lo_imag));
837 __m128d imag_lo = _mm_add_pd(_mm_mul_pd(a_lo_real, b_lo_imag), _mm_mul_pd(a_lo_imag, b_lo_real));
838 __m128d result_lo = _mm_unpacklo_pd(real_lo, imag_lo);
840 __m128d a_hi_real = _mm_unpacklo_pd(a_hi, a_hi);
841 __m128d a_hi_imag = _mm_unpackhi_pd(a_hi, a_hi);
842 __m128d b_hi_real = _mm_unpacklo_pd(b_hi, b_hi);
843 __m128d b_hi_imag = _mm_unpackhi_pd(b_hi, b_hi);
845 __m128d real_hi = _mm_sub_pd(_mm_mul_pd(a_hi_real, b_hi_real), _mm_mul_pd(a_hi_imag, b_hi_imag));
846 __m128d imag_hi = _mm_add_pd(_mm_mul_pd(a_hi_real, b_hi_imag), _mm_mul_pd(a_hi_imag, b_hi_real));
847 __m128d result_hi = _mm_unpacklo_pd(real_hi, imag_hi);
849 return _mm256_insertf128_pd(_mm256_castpd128_pd256(result_lo), result_hi, 1);
852 reg result = mul(a, b);
853 return _mm256_add_pd(result, c);
861 alignas(32)
double values[4];
862 _mm256_storeu_pd(values, v);
863 return std::complex<double>(
864 values[0] + values[2],
865 values[1] + values[3]
868 static inline reg maskload(
const std::complex<double>* ptr, __m256i mask) {
869 return _mm256_maskload_pd(
reinterpret_cast<const double*
>(ptr), mask);
871 static inline void maskstore(std::complex<double>* ptr, __m256i mask,
reg v) {
872 _mm256_maskstore_pd(
reinterpret_cast<double*
>(ptr), mask, v);
879 struct SimdTraits<std::complex<float>,
avx512_t> {
// Eight std::complex<float> values fill one 512-bit register.
static constexpr size_t width = 8;
// store() and stream() use the aligned 512-bit instructions, which need a
// 64-byte aligned address.
static constexpr size_t alignment = 64;
884 static inline reg set(std::complex<float> a, std::complex<float> b,
885 std::complex<float> c, std::complex<float> d,
886 std::complex<float> e, std::complex<float> f,
887 std::complex<float> g, std::complex<float> h) {
888 return _mm512_set_ps(
900 static inline reg set1(std::complex<float> x) {
901 return _mm512_set_ps(
902 x.imag(), x.real(), x.imag(), x.real(),
903 x.imag(), x.real(), x.imag(), x.real(),
904 x.imag(), x.real(), x.imag(), x.real(),
905 x.imag(), x.real(), x.imag(), x.real()
909 static inline reg load(
const std::complex<float>* ptr) {
910 return _mm512_loadu_ps(
reinterpret_cast<const float*
>(ptr));
913 static inline reg loadu(
const std::complex<float>* ptr) {
914 return _mm512_loadu_ps(
reinterpret_cast<const float*
>(ptr));
917 static inline void store(std::complex<float>* ptr, reg x) {
918 _mm512_store_ps(
reinterpret_cast<float*
>(ptr), x);
921 static inline void storeu(std::complex<float>* ptr, reg x) {
922 _mm512_storeu_ps(
reinterpret_cast<float*
>(ptr), x);
925 static inline void stream(std::complex<float>* ptr, reg x) {
926 _mm512_stream_ps(
reinterpret_cast<float*
>(ptr), x);
929 static inline reg add(reg a, reg b) {
return _mm512_add_ps(a, b); }
930 static inline reg sub(reg a, reg b) {
return _mm512_sub_ps(a, b); }
932 static inline reg mul(reg a, reg b) {
933 __m512 a_real = _mm512_shuffle_ps(a, a, _MM_SHUFFLE(2,0,2,0));
934 __m512 a_imag = _mm512_shuffle_ps(a, a, _MM_SHUFFLE(3,1,3,1));
935 __m512 b_real = _mm512_shuffle_ps(b, b, _MM_SHUFFLE(2,0,2,0));
936 __m512 b_imag = _mm512_shuffle_ps(b, b, _MM_SHUFFLE(3,1,3,1));
938 __m512
real = _mm512_sub_ps(_mm512_mul_ps(a_real, b_real), _mm512_mul_ps(a_imag, b_imag));
939 __m512 imag = _mm512_add_ps(_mm512_mul_ps(a_real, b_imag), _mm512_mul_ps(a_imag, b_real));
941 return _mm512_unpacklo_ps(
real, imag);
944 static inline reg fma(reg a, reg b, reg c) {
945#if defined(__AVX512F__) && defined(__FMA__)
946 return _mm512_fmadd_ps(a, b, c);
948 reg result = mul(a, b);
949 return _mm512_add_ps(result, c);
953 static inline reg setzero() {
return _mm512_setzero_ps(); }
954 static inline reg zero() {
return _mm512_setzero_ps(); }
955 static inline reg andnot(reg a, reg b) {
return _mm512_andnot_ps(a, b); }
956 static inline reg max(reg a, reg b) {
return _mm512_max_ps(a, b); }
957 static inline reg min(reg a, reg b) {
return _mm512_min_ps(a, b); }
959 static inline std::complex<float> horizontal_add(reg v) {
960 alignas(64)
float values[16];
961 _mm512_storeu_ps(values, v);
962 return std::complex<float>(
963 values[0] + values[2] + values[4] + values[6] +
964 values[8] + values[10] + values[12] + values[14],
965 values[1] + values[3] + values[5] + values[7] +
966 values[9] + values[11] + values[13] + values[15]
972 struct SimdTraits<std::complex<double>,
avx512_t> {
// Four std::complex<double> values fill one 512-bit register.
static constexpr size_t width = 4;
// store() and stream() use the aligned 512-bit instructions, which need a
// 64-byte aligned address.
static constexpr size_t alignment = 64;
977 static inline reg set(std::complex<double> a, std::complex<double> b,
978 std::complex<double> c, std::complex<double> d) {
979 return _mm512_set_pd(
987 static inline reg set1(std::complex<double> x) {
988 return _mm512_set_pd(
989 x.imag(), x.real(), x.imag(), x.real(),
990 x.imag(), x.real(), x.imag(), x.real()
994 static inline reg load(
const std::complex<double>* ptr) {
995 return _mm512_loadu_pd(
reinterpret_cast<const double*
>(ptr));
998 static inline reg loadu(
const std::complex<double>* ptr) {
999 return _mm512_loadu_pd(
reinterpret_cast<const double*
>(ptr));
1002 static inline void store(std::complex<double>* ptr, reg x) {
1003 _mm512_store_pd(
reinterpret_cast<double*
>(ptr), x);
1006 static inline void storeu(std::complex<double>* ptr, reg x) {
1007 _mm512_storeu_pd(
reinterpret_cast<double*
>(ptr), x);
1010 static inline void stream(std::complex<double>* ptr, reg x) {
1011 _mm512_stream_pd(
reinterpret_cast<double*
>(ptr), x);
1014 static inline reg add(reg a, reg b) {
return _mm512_add_pd(a, b); }
1015 static inline reg sub(reg a, reg b) {
return _mm512_sub_pd(a, b); }
1017 static inline reg mul(reg a, reg b) {
1019 __m512d a_real = _mm512_shuffle_pd(a, a, 0b00000000);
1020 __m512d a_imag = _mm512_shuffle_pd(a, a, 0b11111111);
1021 __m512d b_real = _mm512_shuffle_pd(b, b, 0b00000000);
1022 __m512d b_imag = _mm512_shuffle_pd(b, b, 0b11111111);
1024 __m512d
real = _mm512_sub_pd(_mm512_mul_pd(a_real, b_real), _mm512_mul_pd(a_imag, b_imag));
1025 __m512d imag = _mm512_add_pd(_mm512_mul_pd(a_real, b_imag), _mm512_mul_pd(a_imag, b_real));
1027 return _mm512_unpacklo_pd(
real, imag);
1030 static inline reg fmadd(reg a, reg b, reg c) {
1031#if defined(__AVX512F__) && defined(__FMA__)
1032 return _mm512_fmadd_pd(a, b, c);
1034 reg result = mul(a, b);
1035 return _mm512_add_pd(result, c);
1039 static inline reg setzero() {
return _mm512_setzero_pd(); }
1040 static inline reg zero() {
return _mm512_setzero_pd(); }
1041 static inline reg andnot(reg a, reg b) {
return _mm512_andnot_pd(a, b); }
1042 static inline reg max(reg a, reg b) {
return _mm512_max_pd(a, b); }
1043 static inline reg min(reg a, reg b) {
return _mm512_min_pd(a, b); }
1045 static inline std::complex<double> horizontal_add(reg v) {
1046 alignas(64)
double values[8];
1047 _mm512_storeu_pd(values, v);
1048 return std::complex<double>(
1049 values[0] + values[2] + values[4] + values[6],
1050 values[1] + values[3] + values[5] + values[7]
bool supports_avx512()
Definition SIMD.hpp:49
void dispatch_simd(F &&f)
Definition SIMD.hpp:72
__m256 extractf32x8_ps_fallback(__m512 v, int imm8)
Definition SIMD.hpp:87
bool supports_avx2()
Definition SIMD.hpp:55
#define SIMD_WIDTH
Definition SIMD.hpp:27
bool supports_sse()
Definition SIMD.hpp:61
#define ALIGN
Definition SIMD.hpp:26
Multi-dimensional tensor class with fixed rank and SIMD support.
Definition Tensor.hpp:25
Definition Derivate.hpp:24
T GemmKernelBigger< T >::blockA_packed[MC *KC] __attribute__((aligned(64)))
T value
Definition SIMD.hpp:34
__m256 reg
Definition SIMD.hpp:12
static constexpr size_t alignment
Definition SIMD.hpp:12
static constexpr size_t width
Definition SIMD.hpp:12
__m512 reg
Definition SIMD.hpp:13
static constexpr size_t width
Definition SIMD.hpp:13
static constexpr size_t alignment
Definition SIMD.hpp:13
static reg loadu(const double *ptr)
Definition SIMD.hpp:406
static reg fmadd(reg a, reg b, reg c)
Definition SIMD.hpp:410
static void storeu(double *ptr, reg x)
Definition SIMD.hpp:408
static reg add(reg a, reg b)
Definition SIMD.hpp:411
static reg load(const double *ptr)
Definition SIMD.hpp:405
static reg set1(double x)
Definition SIMD.hpp:384
static void store(double *ptr, reg x)
Definition SIMD.hpp:407
static reg broadcast(const double *ptr)
Definition SIMD.hpp:399
static reg setzero()
Definition SIMD.hpp:403
static reg max(reg a, reg b)
Definition SIMD.hpp:416
static reg andnot(reg a, reg b)
Definition SIMD.hpp:414
static void stream(double *ptr, reg x)
Definition SIMD.hpp:402
static reg mul(reg a, reg b)
Definition SIMD.hpp:412
static reg zero()
Definition SIMD.hpp:409
static double extract(reg x, size_t index)
Definition SIMD.hpp:388
static float horizontal_add(reg v)
Definition SIMD.hpp:404
__m256d reg
Definition SIMD.hpp:380
static reg maskload(const double *ptr, __m256i m)
Definition SIMD.hpp:396
static reg sub(reg a, reg b)
Definition SIMD.hpp:413
static reg set(double a, double b, double c, double d)
Definition SIMD.hpp:385
static void store_stream(double *ptr, reg x)
Definition SIMD.hpp:415
static void maskstore(double *ptr, __m256i mask, __m256d value)
Definition SIMD.hpp:393
static reg load(const double *ptr)
Definition SIMD.hpp:265
static reg set(double a, double b)
Definition SIMD.hpp:242
static reg max(reg a, reg b)
Definition SIMD.hpp:276
__m128d reg
Definition SIMD.hpp:237
static double extract(reg x, size_t index)
Definition SIMD.hpp:243
static reg min(reg a, reg b)
Definition SIMD.hpp:277
static reg fmadd(reg a, reg b, reg c)
Definition SIMD.hpp:270
static reg zero()
Definition SIMD.hpp:269
static reg add(reg a, reg b)
Definition SIMD.hpp:271
static void maskstore(double *ptr, __m128i mask, __m128d value)
Definition SIMD.hpp:255
static reg set1(double x)
Definition SIMD.hpp:241
static void store_stream(double *ptr, reg x)
Definition SIMD.hpp:275
static void storeu(double *ptr, reg x)
Definition SIMD.hpp:268
static reg setzero()
Definition SIMD.hpp:249
static reg sub(reg a, reg b)
Definition SIMD.hpp:273
static double horizontal_add(reg v)
Definition SIMD.hpp:250
static reg loadu(const double *ptr)
Definition SIMD.hpp:266
static reg mul(reg a, reg b)
Definition SIMD.hpp:272
static reg andnot(reg a, reg b)
Definition SIMD.hpp:274
static void stream(double *ptr, reg x)
Definition SIMD.hpp:248
static void store(double *ptr, reg x)
Definition SIMD.hpp:267
static void stream(float *ptr, reg x)
Definition SIMD.hpp:358
static reg maskload(const float *ptr, __m256i m)
Definition SIMD.hpp:352
static reg sub(reg a, reg b)
Definition SIMD.hpp:370
static reg set(float a, float b, float c, float d)
Definition SIMD.hpp:330
__m256 reg
Definition SIMD.hpp:325
static reg permute(reg x)
Definition SIMD.hpp:337
static void storeu(float *ptr, reg x)
Definition SIMD.hpp:365
static reg loadu(const float *ptr)
Definition SIMD.hpp:363
static void store(float *ptr, reg x)
Definition SIMD.hpp:364
static reg load(const float *ptr)
Definition SIMD.hpp:362
static reg zero()
Definition SIMD.hpp:366
static reg broadcast(const float *ptr)
Definition SIMD.hpp:355
static reg fmadd(reg a, reg b, reg c)
Definition SIMD.hpp:367
static reg fma(reg a, reg b, reg c)
Definition SIMD.hpp:360
static float extract(reg x, size_t index)
Definition SIMD.hpp:347
static reg setzero()
Definition SIMD.hpp:359
static void store_stream(float *ptr, reg x)
Definition SIMD.hpp:372
static reg mul(reg a, reg b)
Definition SIMD.hpp:369
static float horizontal_add(reg v)
Definition SIMD.hpp:361
static void maskstore(float *ptr, __m256i mask, __m256 value)
Definition SIMD.hpp:333
static reg andnot(reg a, reg b)
Definition SIMD.hpp:371
static reg add(reg a, reg b)
Definition SIMD.hpp:368
static reg min(reg a, reg b)
Definition SIMD.hpp:374
static reg set8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
Definition SIMD.hpp:341
static reg max(reg a, reg b)
Definition SIMD.hpp:373
static reg set1(float x)
Definition SIMD.hpp:329
static reg min(reg a, reg b)
Definition SIMD.hpp:232
static reg max(reg a, reg b)
Definition SIMD.hpp:231
static void store_stream(float *ptr, reg x)
Definition SIMD.hpp:230
static float extract(reg x, size_t index)
Definition SIMD.hpp:207
static reg sub(reg a, reg b)
Definition SIMD.hpp:228
static reg permute(reg x)
Definition SIMD.hpp:189
static void maskstore(float *ptr, reg mask, reg value)
Definition SIMD.hpp:197
static reg set(float a, float b, float c, float d)
Definition SIMD.hpp:185
static void stream(float *ptr, reg x)
Definition SIMD.hpp:212
static reg zero()
Definition SIMD.hpp:224
static float horizontal_add(reg v)
Definition SIMD.hpp:215
static reg mul(reg a, reg b)
Definition SIMD.hpp:227
static reg fma(reg a, reg b, reg c)
Definition SIMD.hpp:214
static reg fmadd(reg a, reg b, reg c)
Definition SIMD.hpp:225
static reg andnot(reg a, reg b)
Definition SIMD.hpp:229
static void store(float *ptr, reg x)
Definition SIMD.hpp:222
static reg loadu(const float *ptr)
Definition SIMD.hpp:221
__m128 reg
Definition SIMD.hpp:180
static reg add(reg a, reg b)
Definition SIMD.hpp:226
static reg set1(float x)
Definition SIMD.hpp:184
static void storeu(float *ptr, reg x)
Definition SIMD.hpp:223
static reg setzero()
Definition SIMD.hpp:213
static reg set4(float a0, float a1, float a2, float a3)
Definition SIMD.hpp:192
static reg load(const float *ptr)
Definition SIMD.hpp:220
static void storeu(uint64_t *ptr, reg x)
Definition SIMD.hpp:445
static void stream(uint64_t *ptr, reg x)
Definition SIMD.hpp:434
static void store(uint64_t *ptr, reg x)
Definition SIMD.hpp:444
static reg set_epi64(int64_t a, int64_t b, int64_t c, int64_t d)
Definition SIMD.hpp:460
__m256i reg
Definition SIMD.hpp:421
static reg zero()
Definition SIMD.hpp:454
static reg max(reg a, reg b)
Definition SIMD.hpp:463
static reg add(reg a, reg b)
Definition SIMD.hpp:456
static void store_stream(uint64_t *ptr, reg x)
Definition SIMD.hpp:459
static reg setzero()
Definition SIMD.hpp:435
static reg set(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
Definition SIMD.hpp:426
static reg mul(reg a, reg b)
Definition SIMD.hpp:446
static reg load(const size_t *ptr)
Definition SIMD.hpp:437
static reg set1(uint64_t x)
Definition SIMD.hpp:425
static reg loadu(const uint64_t *ptr)
Definition SIMD.hpp:443
static uint64_t extract(reg x, size_t index)
Definition SIMD.hpp:429
static float horizontal_add(reg v)
Definition SIMD.hpp:436
static reg fmadd(reg a, reg b, reg c)
Definition SIMD.hpp:455
static reg andnot(reg a, reg b)
Definition SIMD.hpp:458
static reg sub(reg a, reg b)
Definition SIMD.hpp:457
static void stream(uint64_t *ptr, reg x)
Definition SIMD.hpp:293
static reg setzero()
Definition SIMD.hpp:294
static reg sub(reg a, reg b)
Definition SIMD.hpp:315
static reg zero()
Definition SIMD.hpp:304
static void storeu(uint64_t *ptr, reg x)
Definition SIMD.hpp:303
static void store(uint64_t *ptr, reg x)
Definition SIMD.hpp:302
static reg loadu(const uint64_t *ptr)
Definition SIMD.hpp:301
static uint64_t extract(reg x, size_t index)
Definition SIMD.hpp:288
static reg set(uint64_t a, uint64_t b)
Definition SIMD.hpp:287
static reg add(reg a, reg b)
Definition SIMD.hpp:314
static reg max(reg a, reg b)
Definition SIMD.hpp:319
static reg load(const uint64_t *ptr)
Definition SIMD.hpp:300
static reg min(reg a, reg b)
Definition SIMD.hpp:320
__m128i reg
Definition SIMD.hpp:282
static reg andnot(reg a, reg b)
Definition SIMD.hpp:316
static float horizontal_add(reg v)
Definition SIMD.hpp:295
static reg mul(reg a, reg b)
Definition SIMD.hpp:306
static reg set_epi64(int64_t a, int64_t b)
Definition SIMD.hpp:318
static reg set1(uint64_t x)
Definition SIMD.hpp:286
static void store_stream(uint64_t *ptr, reg x)
Definition SIMD.hpp:317
static reg fmadd(reg a, reg b, reg c)
Definition SIMD.hpp:305
static reg sub(reg a, reg b)
Definition SIMD.hpp:824
static reg set(std::complex< double > a, std::complex< double > b)
Definition SIMD.hpp:797
static reg max(reg a, reg b)
Definition SIMD.hpp:856
static reg set1(std::complex< double > x)
Definition SIMD.hpp:803
static reg broadcast(const std::complex< double > *ptr)
Definition SIMD.hpp:818
static reg load(const std::complex< double > *ptr)
Definition SIMD.hpp:806
static reg add(reg a, reg b)
Definition SIMD.hpp:823
static reg maskload(const std::complex< double > *ptr, __m256i mask)
Definition SIMD.hpp:868
static reg fma(reg a, reg b, reg c)
Definition SIMD.hpp:859
static reg setzero()
Definition SIMD.hpp:858
static reg mul(reg a, reg b)
Definition SIMD.hpp:825
__m256d reg
Definition SIMD.hpp:793
static void storeu(std::complex< double > *ptr, reg x)
Definition SIMD.hpp:815
static std::complex< double > horizontal_add(reg v)
Definition SIMD.hpp:860
static void store(std::complex< double > *ptr, reg x)
Definition SIMD.hpp:812
static reg loadu(const std::complex< double > *ptr)
Definition SIMD.hpp:809
static void maskstore(std::complex< double > *ptr, __m256i mask, reg v)
Definition SIMD.hpp:871
static reg min(reg a, reg b)
Definition SIMD.hpp:857
static reg andnot(reg a, reg b)
Definition SIMD.hpp:855
static reg fmadd(reg a, reg b, reg c)
Definition SIMD.hpp:851
static reg fmadd(reg a, reg b, reg c)
Definition SIMD.hpp:694
static reg sub(reg a, reg b)
Definition SIMD.hpp:682
static reg set(std::complex< double > a, std::complex< double > b)
Definition SIMD.hpp:658
static std::complex< double > extract(reg x, size_t=0)
Definition SIMD.hpp:670
static reg max(reg a, reg b)
Definition SIMD.hpp:699
static reg load(const std::complex< double > *ptr)
Definition SIMD.hpp:664
static reg andnot(reg a, reg b)
Definition SIMD.hpp:698
static reg setzero()
Definition SIMD.hpp:701
static reg set1(std::complex< double > x)
Definition SIMD.hpp:661
__m128d reg
Definition SIMD.hpp:654
static void storeu(std::complex< double > *ptr, reg x)
Definition SIMD.hpp:678
static reg min(reg a, reg b)
Definition SIMD.hpp:700
static void store(std::complex< double > *ptr, reg x)
Definition SIMD.hpp:675
static reg mul(reg a, reg b)
Definition SIMD.hpp:683
static reg add(reg a, reg b)
Definition SIMD.hpp:681
static reg loadu(const std::complex< double > *ptr)
Definition SIMD.hpp:667
static reg broadcast(const std::complex< float > *ptr)
Definition SIMD.hpp:734
static reg load(const std::complex< float > *ptr)
Definition SIMD.hpp:723
static void maskstore(std::complex< float > *ptr, __m256i mask, reg v)
Definition SIMD.hpp:786
static reg setzero()
Definition SIMD.hpp:772
static std::complex< float > extract(reg x, size_t index)
Definition SIMD.hpp:729
static reg min(reg a, reg b)
Definition SIMD.hpp:771
static reg sub(reg a, reg b)
Definition SIMD.hpp:750
static void stream(std::complex< float > *ptr, reg x)
Definition SIMD.hpp:746
static reg fmadd(reg a, reg b, reg c)
Definition SIMD.hpp:765
static reg set(std::complex< float > a, std::complex< float > b, std::complex< float > c, std::complex< float > d)
Definition SIMD.hpp:710
__m256 reg
Definition SIMD.hpp:706
static reg max(reg a, reg b)
Definition SIMD.hpp:770
static reg zero()
Definition SIMD.hpp:773
static reg loadu(const std::complex< float > *ptr)
Definition SIMD.hpp:726
static void storeu(std::complex< float > *ptr, reg x)
Definition SIMD.hpp:743
static reg add(reg a, reg b)
Definition SIMD.hpp:749
static reg maskload(const std::complex< float > *ptr, __m256i mask)
Definition SIMD.hpp:783
static reg set1(std::complex< float > x)
Definition SIMD.hpp:719
static reg fma(reg a, reg b, reg c)
Definition SIMD.hpp:774
static void store(std::complex< float > *ptr, reg x)
Definition SIMD.hpp:740
static std::complex< float > horizontal_add(reg v)
Definition SIMD.hpp:775
static reg andnot(reg a, reg b)
Definition SIMD.hpp:769
static reg mul(reg a, reg b)
Definition SIMD.hpp:751
static reg andnot(reg a, reg b)
Definition SIMD.hpp:634
static reg loadu(const std::complex< float > *ptr)
Definition SIMD.hpp:612
static void storeu(std::complex< float > *ptr, reg x)
Definition SIMD.hpp:618
static std::complex< float > horizontal_add(reg v)
Definition SIMD.hpp:639
static void stream(std::complex< float > *ptr, reg x)
Definition SIMD.hpp:644
static reg set(std::complex< float > a, std::complex< float > b)
Definition SIMD.hpp:598
static void store(std::complex< float > *ptr, reg x)
Definition SIMD.hpp:615
static reg load(const std::complex< float > *ptr)
Definition SIMD.hpp:604
__m128 reg
Definition SIMD.hpp:594
static reg max(reg a, reg b)
Definition SIMD.hpp:635
static reg fma(reg a, reg b, reg c)
Definition SIMD.hpp:638
static reg min(reg a, reg b)
Definition SIMD.hpp:636
static reg setzero()
Definition SIMD.hpp:637
static reg mul(reg a, reg b)
Definition SIMD.hpp:623
static std::complex< float > extract(reg x, size_t index)
Definition SIMD.hpp:607
static reg set1(std::complex< float > x)
Definition SIMD.hpp:601
static reg sub(reg a, reg b)
Definition SIMD.hpp:622
static void store_stream(std::complex< float > *ptr, reg x)
Definition SIMD.hpp:647
static reg add(reg a, reg b)
Definition SIMD.hpp:621
__m128 reg
Definition SIMD.hpp:11
static constexpr size_t alignment
Definition SIMD.hpp:11
static constexpr size_t width
Definition SIMD.hpp:11