1 #include "simd.h"
2 
3 ENTRY(simd_test);
4 
5 #if defined(__AVX512F__)
6 # define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
7 # if VEC_SIZE == 4
8 #  define eq(x, y) ({ \
9     float x_ = (x)[0]; \
10     float __attribute__((vector_size(16))) y_ = { (y)[0] }; \
11     unsigned short r_; \
12     asm ( "vcmpss $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
13     r_ == 1; \
14 })
15 # elif VEC_SIZE == 8
16 #  define eq(x, y) ({ \
17     double x_ = (x)[0]; \
18     double __attribute__((vector_size(16))) y_ = { (y)[0] }; \
19     unsigned short r_; \
20     asm ( "vcmpsd $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
21     r_ == 1; \
22 })
23 # elif FLOAT_SIZE == 4
24 /*
25  * gcc's (up to at least 8.2) __builtin_ia32_cmpps256_mask() has an anomaly in
26  * that its return type is QI rather than UQI, and hence the value would get
27  * sign-extended before comapring to ALL_TRUE. The same oddity does not matter
28  * for __builtin_ia32_cmppd256_mask(), as there only 4 bits are significant.
29  * Hence the extra " & ALL_TRUE".
30  */
31 #  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
32 # elif FLOAT_SIZE == 8
33 #  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
34 # elif (INT_SIZE == 1 || UINT_SIZE == 1) && defined(__AVX512BW__)
35 #  define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
36 # elif (INT_SIZE == 2 || UINT_SIZE == 2) && defined(__AVX512BW__)
37 #  define eq(x, y) (B(pcmpeqw, _mask, (vhi_t)(x), (vhi_t)(y), -1) == ALL_TRUE)
38 # elif INT_SIZE == 4 || UINT_SIZE == 4
39 #  define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
40 # elif INT_SIZE == 8 || UINT_SIZE == 8
41 #  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
42 # endif
43 #elif VEC_SIZE == 8 && defined(__SSE__)
44 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
45 #elif VEC_SIZE == 16
46 # if defined(__AVX__) && defined(FLOAT_SIZE)
47 #  if ELEM_SIZE == 4
48 #   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
49 #  elif ELEM_SIZE == 8
50 #   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
51 #  endif
52 # elif defined(__SSE4_1__)
53 #  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
54 # elif defined(__SSE__) && ELEM_SIZE == 4
55 #  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
56 # elif defined(__SSE2__)
57 #  if ELEM_SIZE == 8
58 #   define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) == 3)
59 #  else
60 #   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
61 #  endif
62 # endif
63 #elif VEC_SIZE == 32
64 # if defined(__AVX2__)
65 #  define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vdi_t){} == 0)
66 # elif defined(__AVX__) && ELEM_SIZE == 4
67 #  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
68 # elif defined(__AVX__) && ELEM_SIZE == 8
69 #  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
70 # endif
71 #endif
72 
73 #ifndef to_bool
_to_bool(byte_vec_t bv)74 static inline bool _to_bool(byte_vec_t bv)
75 {
76     unsigned int i;
77 
78     for ( i = 0; i < VEC_SIZE; ++i )
79         if ( bv[i] != 0xff )
80             return false;
81 
82     return true;
83 }
84 # define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
85 #endif
86 
87 #ifndef eq
88 # define eq(x, y) to_bool((x) == (y))
89 #endif
90 
91 #if VEC_SIZE == FLOAT_SIZE
92 # define to_int(x) ({ int i_ = (x)[0]; touch(i_); ((vec_t){ i_ }); })
93 # ifdef __x86_64__
94 #  define to_wint(x) ({ long l_ = (x)[0]; touch(l_); ((vec_t){ l_ }); })
95 # endif
96 # ifdef __AVX512F__
97 /*
98  * Sadly even gcc 9.x, at the time of writing, does not carry out at least
99  * uint -> FP conversions using VCVTUSI2S{S,D}, so we need to use builtins
100  * or inline assembly here. The full-vector parameter types of the builtins
101  * aren't very helpful for our purposes, so use inline assembly.
102  */
103 #  if FLOAT_SIZE == 4
104 #   define to_u_int(type, x) ({ \
105     unsigned type u_; \
106     float __attribute__((vector_size(16))) t_; \
107     asm ( "vcvtss2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
108     asm ( "vcvtusi2ss%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
109     (vec_t){ t_[0] }; \
110 })
111 #  elif FLOAT_SIZE == 8
112 #   define to_u_int(type, x) ({ \
113     unsigned type u_; \
114     double __attribute__((vector_size(16))) t_; \
115     asm ( "vcvtsd2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
116     asm ( "vcvtusi2sd%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
117     (vec_t){ t_[0] }; \
118 })
119 #  endif
120 #  define to_uint(x) to_u_int(int, x)
121 #  ifdef __x86_64__
122 #   define to_uwint(x) to_u_int(long, x)
123 #  endif
124 # endif
125 #elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
126 # define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
127 #elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \
128       (VEC_SIZE == 64 || defined(__AVX512VL__))
129 # if FLOAT_SIZE == 4
130 #  define to_int(x) BR(cvtdq2ps, _mask, BR(cvtps2dq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
131 #  define to_uint(x) BR(cvtudq2ps, _mask, BR(cvtps2udq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
132 #  ifdef __AVX512DQ__
133 #   define to_w_int(x, s) ({ \
134     vsf_half_t t_ = low_half(x); \
135     vdi_t lo_, hi_; \
136     touch(t_); \
137     lo_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \
138     t_ = high_half(x); \
139     touch(t_); \
140     hi_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \
141     touch(lo_); touch(hi_); \
142     insert_half(insert_half(undef(), \
143                             BR(cvt ## s ## qq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \
144                 BR(cvt ## s ## qq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \
145 })
146 #   define to_wint(x) to_w_int(x, )
147 #   define to_uwint(x) to_w_int(x, u)
148 #  endif
149 # elif FLOAT_SIZE == 8
150 #  define to_int(x) B(cvtdq2pd, _mask, BR(cvtpd2dq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
151 #  define to_uint(x) B(cvtudq2pd, _mask, BR(cvtpd2udq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
152 #  ifdef __AVX512DQ__
153 #   define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
154 #   define to_uwint(x) BR(cvtuqq2pd, _mask, BR(cvtpd2uqq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
155 #  endif
156 # endif
157 #elif VEC_SIZE == 16 && defined(__SSE2__)
158 # if FLOAT_SIZE == 4
159 #  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
160 # elif FLOAT_SIZE == 8
161 #  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
162 # endif
163 #elif VEC_SIZE == 32 && defined(__AVX__)
164 # if FLOAT_SIZE == 4
165 #  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
166 # elif FLOAT_SIZE == 8
167 #  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
168 # endif
169 #endif
170 
171 #if VEC_SIZE == FLOAT_SIZE
172 # define scalar_1op(x, op) ({ \
173     typeof((x)[0]) __attribute__((vector_size(16))) r_; \
174     asm ( op : [out] "=&x" (r_) : [in] "m" (x) ); \
175     (vec_t){ r_[0] }; \
176 })
177 # define scalar_2op(x, y, op) ({ \
178     typeof((x)[0]) __attribute__((vector_size(16))) r_ = { x[0] }; \
179     asm ( op : [out] "=&x" (r_) : [in1] "[out]" (r_), [in2] "m" (y) ); \
180     (vec_t){ r_[0] }; \
181 })
182 #endif
183 
184 #if VEC_SIZE == 16 && FLOAT_SIZE == 4 && defined(__SSE__)
185 # define low_half(x) (x)
186 # define high_half(x) B_(movhlps, , undef(), x)
187 /*
188  * GCC 7 (and perhaps earlier) report a bogus type mismatch for the conditional
189  * expression below. All works well with this no-op wrapper.
190  */
movlhps(vec_t x,vec_t y)191 static inline vec_t movlhps(vec_t x, vec_t y) {
192     return __builtin_ia32_movlhps(x, y);
193 }
194 # define insert_pair(x, y, p) \
195     ((p) ? movlhps(x, y) \
196          : ({ vec_t t_ = (x); t_[0] = (y)[0]; t_[1] = (y)[1]; t_; }))
197 #endif
198 
199 #if VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW_A__)
200 # define max __builtin_ia32_pfmax
201 # define min __builtin_ia32_pfmin
202 # define recip(x) ({ \
203     vec_t t_ = __builtin_ia32_pfrcp(x); \
204     touch(x); \
205     t_[1] = __builtin_ia32_pfrcp(__builtin_ia32_pswapdsf(x))[0]; \
206     touch(x); \
207     __builtin_ia32_pfrcpit2(__builtin_ia32_pfrcpit1(t_, x), t_); \
208 })
209 # define rsqrt(x) ({ \
210     vec_t t_ = __builtin_ia32_pfrsqrt(x); \
211     touch(x); \
212     t_[1] = __builtin_ia32_pfrsqrt(__builtin_ia32_pswapdsf(x))[0]; \
213     touch(x); \
214     __builtin_ia32_pfrcpit2(__builtin_ia32_pfrsqit1(__builtin_ia32_pfmul(t_, t_), x), t_); \
215 })
216 #elif defined(FLOAT_SIZE) && VEC_SIZE == FLOAT_SIZE && defined(__AVX512F__)
217 # if FLOAT_SIZE == 4
218 #  define getexp(x) scalar_1op(x, "vgetexpss %[in], %[out], %[out]")
219 #  define getmant(x) scalar_1op(x, "vgetmantss $0, %[in], %[out], %[out]")
220 #  ifdef __AVX512ER__
221 #   define recip(x) scalar_1op(x, "vrcp28ss %[in], %[out], %[out]")
222 #   define rsqrt(x) scalar_1op(x, "vrsqrt28ss %[in], %[out], %[out]")
223 #  else
224 #   define recip(x) scalar_1op(x, "vrcp14ss %[in], %[out], %[out]")
225 #   define rsqrt(x) scalar_1op(x, "vrsqrt14ss %[in], %[out], %[out]")
226 #  endif
227 #  define scale(x, y) scalar_2op(x, y, "vscalefss %[in2], %[in1], %[out]")
228 #  define sqrt(x) scalar_1op(x, "vsqrtss %[in], %[out], %[out]")
229 #  define trunc(x) scalar_1op(x, "vrndscaless $0b1011, %[in], %[out], %[out]")
230 # elif FLOAT_SIZE == 8
231 #  define getexp(x) scalar_1op(x, "vgetexpsd %[in], %[out], %[out]")
232 #  define getmant(x) scalar_1op(x, "vgetmantsd $0, %[in], %[out], %[out]")
233 #  ifdef __AVX512ER__
234 #   define recip(x) scalar_1op(x, "vrcp28sd %[in], %[out], %[out]")
235 #   define rsqrt(x) scalar_1op(x, "vrsqrt28sd %[in], %[out], %[out]")
236 #  else
237 #   define recip(x) scalar_1op(x, "vrcp14sd %[in], %[out], %[out]")
238 #   define rsqrt(x) scalar_1op(x, "vrsqrt14sd %[in], %[out], %[out]")
239 #  endif
240 #  define scale(x, y) scalar_2op(x, y, "vscalefsd %[in2], %[in1], %[out]")
241 #  define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
242 #  define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]")
243 # endif
244 #elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
245       (VEC_SIZE == 64 || defined(__AVX512VL__))
246 # if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
247      (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
248      (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
249 #  define _half(x, lh) ({ \
250     half_t t_; \
251     asm ( "vextractf%c[w]x%c[n] %[sel], %[s], %[d]" \
252           : [d] "=m" (t_) \
253           : [s] "v" (x), [sel] "i" (lh), \
254             [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
255     t_; \
256 })
257 #  define low_half(x)  _half(x, 0)
258 #  define high_half(x) _half(x, 1)
259 # endif
260 # if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
261      (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
262 #  define low_quarter(x) ({ \
263     quarter_t t_; \
264     asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
265           : [d] "=m" (t_) \
266           : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
267     t_; \
268 })
269 # endif
270 # if FLOAT_SIZE == 4
271 #  define broadcast(x) ({ \
272     vec_t t_; \
273     asm ( "%{evex%} vbroadcastss %1, %0" \
274           : "=v" (t_) : "m" (*(float[1]){ x }) ); \
275     t_; \
276 })
277 #  if VEC_SIZE >= 32 && defined(__AVX512DQ__)
278 #   define broadcast_pair(x) ({ \
279     vec_t t_; \
280     asm ( "vbroadcastf32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
281     t_; \
282 })
283 #  endif
284 #  if VEC_SIZE == 64 && defined(__AVX512DQ__)
285 #   define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0)
286 #   define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0)
287 #  endif
288 #  ifdef __AVX512DQ__
289 #   define frac(x) B(reduceps, _mask, x, 0b00001011, undef(), ~0)
290 #  endif
291 #  define getexp(x) BR(getexpps, _mask, x, undef(), ~0)
292 #  define getmant(x) BR(getmantps, _mask, x, 0, undef(), ~0)
293 #  ifdef __AVX512DQ__
294 #   define max(x, y) BR(rangeps, _mask, x, y, 0b0101, undef(), ~0)
295 #   define min(x, y) BR(rangeps, _mask, x, y, 0b0100, undef(), ~0)
296 #  else
297 #   define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
298 #   define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
299 #  endif
300 #  define mix(x, y) B(blendmps_, _mask, x, y, (0b1010101010101010 & ALL_TRUE))
301 #  define scale(x, y) BR(scalefps, _mask, x, y, undef(), ~0)
302 #  if VEC_SIZE == 64 && defined(__AVX512ER__)
303 #   define recip(x) BR(rcp28ps, _mask, x, undef(), ~0)
304 #   define rsqrt(x) BR(rsqrt28ps, _mask, x, undef(), ~0)
305 #  else
306 #   define recip(x) B(rcp14ps, _mask, x, undef(), ~0)
307 #   define rsqrt(x) B(rsqrt14ps, _mask, x, undef(), ~0)
308 #  endif
309 #  define shrink1(x) BR_(cvtpd2ps, _mask, (vdf_t)(x), (vsf_half_t){}, ~0)
310 #  define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0)
311 #  define trunc(x) BR(rndscaleps_, _mask, x, 0b1011, undef(), ~0)
312 #  define widen1(x) ((vec_t)BR(cvtps2pd, _mask, x, (vdf_t)undef(), ~0))
313 #  if VEC_SIZE == 16
314 #   define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
315 #   define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
316 #   define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
317 #   define swap2(x) B_(vpermilps, _mask, x, 0b00011011, undef(), ~0)
318 #  else
319 #   define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
320 #   define insert_pair(x, y, p) \
321     B(insertf32x4_, _mask, x, \
322       /* Cast needed below to work around gcc 7.x quirk. */ \
323       (p) & 1 ? (typeof(y))__builtin_ia32_shufps(y, y, 0b01000100) : (y), \
324       (p) >> 1, x, 3 << ((p) * 2))
325 #   define insert_quartet(x, y, p) B(insertf32x4_, _mask, x, y, p, undef(), ~0)
326 #   define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
327 #   define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
328 #   define swap(x) ({ \
329     vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
330     B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
331 })
332 #   define swap2(x) B(vpermilps, _mask, \
333                        B(shuf_f32x4_, _mask, x, x, \
334                          VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
335                        0b00011011, undef(), ~0)
336 #  endif
337 # elif FLOAT_SIZE == 8
338 #  if VEC_SIZE >= 32
339 #   define broadcast(x) ({ \
340     vec_t t_; \
341     asm ( "%{evex%} vbroadcastsd %1, %0" : "=v" (t_) \
342           : "m" (*(double[1]){ x }) ); \
343     t_; \
344 })
345 #  else
346 #   define broadcast(x) ({ \
347     vec_t t_; \
348     asm ( "%{evex%} vpbroadcastq %1, %0" \
349           : "=v" (t_) : "m" (*(double[1]){ x }) ); \
350     t_; \
351 })
352 #  endif
353 #  if VEC_SIZE >= 32 && defined(__AVX512DQ__)
354 #   define broadcast_pair(x) B(broadcastf64x2_, _mask, x, undef(), ~0)
355 #   define insert_pair(x, y, p) B(insertf64x2_, _mask, x, y, p, undef(), ~0)
356 #  endif
357 #  if VEC_SIZE == 64
358 #   define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0)
359 #   define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0)
360 #  endif
361 #  ifdef __AVX512DQ__
362 #   define frac(x) B(reducepd, _mask, x, 0b00001011, undef(), ~0)
363 #  endif
364 #  define getexp(x) BR(getexppd, _mask, x, undef(), ~0)
365 #  define getmant(x) BR(getmantpd, _mask, x, 0, undef(), ~0)
366 #  ifdef __AVX512DQ__
367 #   define max(x, y) BR(rangepd, _mask, x, y, 0b0101, undef(), ~0)
368 #   define min(x, y) BR(rangepd, _mask, x, y, 0b0100, undef(), ~0)
369 #  else
370 #   define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
371 #   define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
372 #  endif
373 #  define mix(x, y) B(blendmpd_, _mask, x, y, 0b10101010)
374 #  define scale(x, y) BR(scalefpd, _mask, x, y, undef(), ~0)
375 #  if VEC_SIZE == 64 && defined(__AVX512ER__)
376 #   define recip(x) BR(rcp28pd, _mask, x, undef(), ~0)
377 #   define rsqrt(x) BR(rsqrt28pd, _mask, x, undef(), ~0)
378 #  else
379 #   define recip(x) B(rcp14pd, _mask, x, undef(), ~0)
380 #   define rsqrt(x) B(rsqrt14pd, _mask, x, undef(), ~0)
381 #  endif
382 #  define sqrt(x) BR(sqrtpd, _mask, x, undef(), ~0)
383 #  define trunc(x) BR(rndscalepd_, _mask, x, 0b1011, undef(), ~0)
384 #  if VEC_SIZE == 16
385 #   define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
386 #   define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
387 #   define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
388 #   define swap2(x) B_(vpermilpd, _mask, x, 0b01, undef(), ~0)
389 #  else
390 #   define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
391 #   define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
392 #   define swap(x) ({ \
393     vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
394     B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
395 })
396 #   define swap2(x) B(vpermilpd, _mask, \
397                        B(shuf_f64x2_, _mask, x, x, \
398                          VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
399                        0b01010101, undef(), ~0)
400 #  endif
401 # endif
402 #elif FLOAT_SIZE == 4 && defined(__SSE__)
403 # if VEC_SIZE == 32 && defined(__AVX__)
404 #  if defined(__AVX2__)
405 #   define broadcast(x) \
406     __builtin_ia32_vbroadcastss_ps256((float __attribute__((vector_size(16)))){ x })
407 #  else
408 #   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
409 #  endif
410 #  define max(x, y) __builtin_ia32_maxps256(x, y)
411 #  define min(x, y) __builtin_ia32_minps256(x, y)
412 #  define recip(x) __builtin_ia32_rcpps256(x)
413 #  define rsqrt(x) __builtin_ia32_rsqrtps256(x)
414 #  define sqrt(x) __builtin_ia32_sqrtps256(x)
415 #  define swap(x) ({ \
416     vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
417     __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
418 })
419 #  ifdef __AVX2__
420 #   define swap2(x) __builtin_ia32_permvarsf256(x, __builtin_ia32_cvtps2dq256(inv) - 1)
421 #  else
422 #   define swap2(x) ({ \
423         vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
424         __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
425 })
426 #  endif
427 # elif VEC_SIZE == 16
428 #  if defined(__AVX2__)
429 #   define broadcast(x) __builtin_ia32_vbroadcastss_ps((vec_t){ x })
430 #  elif defined(__AVX__)
431 #   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
432 #  endif
433 #  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
434 #  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
435 #  define max(x, y) __builtin_ia32_maxps(x, y)
436 #  define min(x, y) __builtin_ia32_minps(x, y)
437 #  define recip(x) __builtin_ia32_rcpps(x)
438 #  define rsqrt(x) __builtin_ia32_rsqrtps(x)
439 #  define sqrt(x) __builtin_ia32_sqrtps(x)
440 #  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
441 #  ifdef __AVX__
442 #   define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
443 #  endif
444 # elif VEC_SIZE == 4
445 #  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
446 #  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
447 #  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
448 # endif
449 #elif FLOAT_SIZE == 8 && defined(__SSE2__)
450 # if VEC_SIZE == 32 && defined(__AVX__)
451 #  if defined(__AVX2__)
452 #   define broadcast(x) \
453     __builtin_ia32_vbroadcastsd_pd256((double __attribute__((vector_size(16)))){ x })
454 #  else
455 #   define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
456 #  endif
457 #  define max(x, y) __builtin_ia32_maxpd256(x, y)
458 #  define min(x, y) __builtin_ia32_minpd256(x, y)
459 #  define recip(x) ({ \
460     float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
461     t_ = __builtin_ia32_vextractf128_ps256( \
462              __builtin_ia32_rcpps256( \
463                  __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
464     __builtin_ia32_cvtps2pd256(t_); \
465 })
466 #  define rsqrt(x) ({ \
467     float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
468     float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
469     t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
470     t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
471     __builtin_ia32_cvtps2pd256(t1_); \
472 })
473 #  define sqrt(x) __builtin_ia32_sqrtpd256(x)
474 #  define swap(x) ({ \
475     vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
476     __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
477 })
478 #  ifdef __AVX2__
479 #   define swap2(x) __builtin_ia32_permdf256(x, 0b00011011)
480 #  endif
481 # elif VEC_SIZE == 16
482 #  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
483 #  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
484 #  define max(x, y) __builtin_ia32_maxpd(x, y)
485 #  define min(x, y) __builtin_ia32_minpd(x, y)
486 #  define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2ps(x)))
487 #  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
488 #  define sqrt(x) __builtin_ia32_sqrtpd(x)
489 #  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
490 #  ifdef __AVX__
491 #   define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
492                                                        __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
493 #  endif
494 # elif VEC_SIZE == 8
495 #  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
496 #  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
497 #  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
498 # endif
499 #endif
500 #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
501      defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
502 # if ELEM_COUNT == 8 /* vextracti{32,64}x4 */ || \
503      (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextracti32x8 */ || \
504      (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
505 #  define low_half(x) ({ \
506     half_t t_; \
507     asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
508           : [d] "=m" (t_) \
509           : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
510     t_; \
511 })
512 # endif
513 # if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextracti32x4 */ || \
514        (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
515 #  define low_quarter(x) ({ \
516     quarter_t t_; \
517     asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
518           : [d] "=m" (t_) \
519           : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
520     t_; \
521 })
522 # endif
523 # if INT_SIZE == 4 || UINT_SIZE == 4
524 #  define broadcast(x) ({ \
525     vec_t t_; \
526     asm ( "%{evex%} vpbroadcastd %1, %0" \
527           : "=v" (t_) : "m" (*(int[1]){ x }) ); \
528     t_; \
529 })
530 #  define broadcast2(x) ({ \
531     vec_t t_; \
532     asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
533     t_; \
534 })
535 #  ifdef __AVX512DQ__
536 #   define broadcast_pair(x) ({ \
537     vec_t t_; \
538     asm ( "vbroadcasti32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
539     t_; \
540 })
541 #  endif
542 #  if VEC_SIZE == 64 && defined(__AVX512DQ__)
543 #   define broadcast_octet(x) ((vec_t)B(broadcasti32x8_, _mask, (vsi_octet_t)(x), (vsi_t)undef(), ~0))
544 #   define insert_octet(x, y, p) ((vec_t)B(inserti32x8_, _mask, (vsi_t)(x), (vsi_octet_t)(y), p, (vsi_t)undef(), ~0))
545 #  endif
546 #  if VEC_SIZE == 16
547 #   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
548 #   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
549 #   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
550 #  else
551 #   define broadcast_quartet(x) ((vec_t)B(broadcasti32x4_, _mask, (vsi_quartet_t)(x), (vsi_t)undef(), ~0))
552 #   define insert_pair(x, y, p) \
553     (vec_t)(B(inserti32x4_, _mask, (vsi_t)(x), \
554               /* First cast needed below to work around gcc 7.x quirk. */ \
555               (p) & 1 ? (vsi_pair_t)__builtin_ia32_pshufd((vsi_pair_t)(y), 0b01000100) \
556                       : (vsi_pair_t)(y), \
557               (p) >> 1, (vsi_t)(x), 3 << ((p) * 2)))
558 #   define insert_quartet(x, y, p) ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_quartet_t)(y), p, (vsi_t)undef(), ~0))
559 #   define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
560 #   define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
561 #   define swap(x) ((vec_t)B(pshufd, _mask, \
562                              B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
563                                VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
564                              0b00011011, (vsi_t)undef(), ~0))
565 #   define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0))
566 #  endif
567 #  define mix(x, y) ((vec_t)B(blendmd_, _mask, (vsi_t)(x), (vsi_t)(y), \
568                               (0b1010101010101010 & ((1 << ELEM_COUNT) - 1))))
569 #  define rotr(x, n) ((vec_t)B(alignd, _mask, (vsi_t)(x), (vsi_t)(x), n, (vsi_t)undef(), ~0))
570 #  define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
571 # elif INT_SIZE == 8 || UINT_SIZE == 8
572 #  define broadcast(x) ({ \
573     vec_t t_; \
574     asm ( "%{evex%} vpbroadcastq %1, %0" \
575           : "=v" (t_) : "m" (*(long long[1]){ x }) ); \
576     t_; \
577 })
578 #  ifdef __x86_64__
579 #   define broadcast2(x) ({ \
580     vec_t t_; \
581     asm ( "vpbroadcastq %1, %0" : "=v" (t_) : "r" ((x) + 0ULL) ); \
582     t_; \
583 })
584 #  endif
585 #  if VEC_SIZE >= 32 && defined(__AVX512DQ__)
586 #   define broadcast_pair(x) ((vec_t)B(broadcasti64x2_, _mask, (vdi_pair_t)(x), (vdi_t)undef(), ~0))
587 #   define insert_pair(x, y, p) ((vec_t)B(inserti64x2_, _mask, (vdi_t)(x), (vdi_pair_t)(y), p, (vdi_t)undef(), ~0))
588 #  endif
589 #  if VEC_SIZE == 64
590 #   define broadcast_quartet(x) ((vec_t)B(broadcasti64x4_, , (vdi_quartet_t)(x), (vdi_t)undef(), ~0))
591 #   define insert_quartet(x, y, p) ((vec_t)B(inserti64x4_, _mask, (vdi_t)(x), (vdi_quartet_t)(y), p, (vdi_t)undef(), ~0))
592 #  endif
593 #  if VEC_SIZE == 16
594 #   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
595 #   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
596 #   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b01001110, (vsi_t)undef(), ~0))
597 #  else
598 #   define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
599 #   define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
600 #   define swap(x) ((vec_t)B(pshufd, _mask, \
601                              (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
602                                       VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
603                              0b01001110, (vsi_t)undef(), ~0))
604 #   define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0))
605 #  endif
606 #  define mix(x, y) ((vec_t)B(blendmq_, _mask, (vdi_t)(x), (vdi_t)(y), 0b10101010))
607 #  define rotr(x, n) ((vec_t)B(alignq, _mask, (vdi_t)(x), (vdi_t)(x), n, (vdi_t)undef(), ~0))
608 #  if VEC_SIZE == 32
609 #   define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0))
610 #  elif VEC_SIZE == 64
611 #   define swap3(x) ({ \
612     vdi_t t_ = B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0); \
613     B(shuf_i64x2_, _mask, t_, t_, 0b01001110, (vdi_t)undef(), ~0); \
614 })
615 #  endif
616 # endif
617 # if INT_SIZE == 4
618 #  define abs(x) B(pabsd, _mask, x, undef(), ~0)
619 #  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
620 #  define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
621 #  define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
622 #  define widen1(x) ((vec_t)B(pmovsxdq, _mask, x, (vdi_t)undef(), ~0))
623 # elif UINT_SIZE == 4
624 #  define max(x, y) ((vec_t)B(pmaxud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
625 #  define min(x, y) ((vec_t)B(pminud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
626 #  define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
627 #  define widen1(x) ((vec_t)B(pmovzxdq, _mask, (vsi_half_t)(x), (vdi_t)undef(), ~0))
628 # elif INT_SIZE == 8
629 #  define abs(x) ((vec_t)B(pabsq, _mask, (vdi_t)(x), (vdi_t)undef(), ~0))
630 #  define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
631 #  define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
632 # elif UINT_SIZE == 8
633 #  define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
634 #  define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
635 # endif
636 #elif (INT_SIZE == 1 || UINT_SIZE == 1 || INT_SIZE == 2 || UINT_SIZE == 2) && \
637       defined(__AVX512BW__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
638 # if INT_SIZE == 1 || UINT_SIZE == 1
639 #  define broadcast(x) ({ \
640     vec_t t_; \
641     asm ( "%{evex%} vpbroadcastb %1, %0" \
642           : "=v" (t_) : "m" (*(char[1]){ x }) ); \
643     t_; \
644 })
645 #  define broadcast2(x) ({ \
646     vec_t t_; \
647     asm ( "vpbroadcastb %k1, %0" : "=v" (t_) : "r" (x) ); \
648     t_; \
649 })
650 #  if VEC_SIZE == 16
651 #   define interleave_hi(x, y) ((vec_t)B(punpckhbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
652 #   define interleave_lo(x, y) ((vec_t)B(punpcklbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
653 #   define rotr(x, n) ((vec_t)B(palignr, _mask, (vdi_t)(x), (vdi_t)(x), (n) * 8, (vdi_t)undef(), ~0))
654 #   define swap(x) ((vec_t)B(pshufb, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
655 #  elif defined(__AVX512VBMI__)
656 #   define interleave_hi(x, y) ((vec_t)B(vpermi2varqi, _mask, (vqi_t)(x), interleave_hi, (vqi_t)(y), ~0))
657 #   define interleave_lo(x, y) ((vec_t)B(vpermt2varqi, _mask, interleave_lo, (vqi_t)(x), (vqi_t)(y), ~0))
658 #  endif
659 #  define mix(x, y) ((vec_t)B(blendmb_, _mask, (vqi_t)(x), (vqi_t)(y), \
660                               (0b1010101010101010101010101010101010101010101010101010101010101010LL & ALL_TRUE)))
661 #  define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
662 #  define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
663 #  define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
664 #  ifdef __AVX512VBMI__
665 #   define swap2(x) ((vec_t)B(permvarqi, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
666 #  endif
667 # elif INT_SIZE == 2 || UINT_SIZE == 2
668 #  define broadcast(x) ({ \
669     vec_t t_; \
670     asm ( "%{evex%} vpbroadcastw %1, %0" \
671           : "=v" (t_) : "m" (*(short[1]){ x }) ); \
672     t_; \
673 })
674 #  define broadcast2(x) ({ \
675     vec_t t_; \
676     asm ( "vpbroadcastw %k1, %0" : "=v" (t_) : "r" (x) ); \
677     t_; \
678 })
679 #  if VEC_SIZE == 16
680 #   define interleave_hi(x, y) ((vec_t)B(punpckhwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
681 #   define interleave_lo(x, y) ((vec_t)B(punpcklwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
682 #   define rotr(x, n) ((vec_t)B(palignr, _mask, (vdi_t)(x), (vdi_t)(x), (n) * 16, (vdi_t)undef(), ~0))
683 #   define swap(x) ((vec_t)B(pshufd, _mask, \
684                              (vsi_t)B(pshufhw, _mask, \
685                                       B(pshuflw, _mask, (vhi_t)(x), 0b00011011, (vhi_t)undef(), ~0), \
686                                       0b00011011, (vhi_t)undef(), ~0), \
687                              0b01001110, (vsi_t)undef(), ~0))
688 #  else
689 #   define interleave_hi(x, y) ((vec_t)B(vpermi2varhi, _mask, (vhi_t)(x), interleave_hi, (vhi_t)(y), ~0))
690 #   define interleave_lo(x, y) ((vec_t)B(vpermt2varhi, _mask, interleave_lo, (vhi_t)(x), (vhi_t)(y), ~0))
691 #  endif
692 #  define mix(x, y) ((vec_t)B(blendmw_, _mask, (vhi_t)(x), (vhi_t)(y), \
693                               (0b10101010101010101010101010101010 & ALL_TRUE)))
694 #  define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
695 #  define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
696 #  define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0))
697 # endif
698 # if INT_SIZE == 1
699 #  define abs(x) ((vec_t)B(pabsb, _mask, (vqi_t)(x), (vqi_t)undef(), ~0))
700 #  define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
701 #  define min(x, y) ((vec_t)B(pminsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
702 #  define widen1(x) ((vec_t)B(pmovsxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
703 #  define widen2(x) ((vec_t)B(pmovsxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
704 #  define widen3(x) ((vec_t)B(pmovsxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
705 # elif UINT_SIZE == 1
706 #  define max(x, y) ((vec_t)B(pmaxub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
707 #  define min(x, y) ((vec_t)B(pminub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
708 #  define widen1(x) ((vec_t)B(pmovzxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
709 #  define widen2(x) ((vec_t)B(pmovzxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
710 #  define widen3(x) ((vec_t)B(pmovzxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
711 # elif INT_SIZE == 2
712 #  define abs(x) B(pabsw, _mask, x, undef(), ~0)
713 #  define max(x, y) B(pmaxsw, _mask, x, y, undef(), ~0)
714 #  define min(x, y) B(pminsw, _mask, x, y, undef(), ~0)
715 #  define mul_hi(x, y) B(pmulhw, _mask, x, y, undef(), ~0)
716 #  define widen1(x) ((vec_t)B(pmovsxwd, _mask, x, (vsi_t)undef(), ~0))
717 #  define widen2(x) ((vec_t)B(pmovsxwq, _mask, x, (vdi_t)undef(), ~0))
718 # elif UINT_SIZE == 2
719 #  define max(x, y) ((vec_t)B(pmaxuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
720 #  define min(x, y) ((vec_t)B(pminuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
721 #  define mul_hi(x, y) ((vec_t)B(pmulhuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
722 #  define widen1(x) ((vec_t)B(pmovzxwd, _mask, (vhi_half_t)(x), (vsi_t)undef(), ~0))
723 #  define widen2(x) ((vec_t)B(pmovzxwq, _mask, (vhi_quarter_t)(x), (vdi_t)undef(), ~0))
724 # endif
725 #elif VEC_SIZE == 16 && defined(__SSE2__)
726 # if INT_SIZE == 1 || UINT_SIZE == 1
727 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
728 #  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
729 # elif INT_SIZE == 2 || UINT_SIZE == 2
730 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)(x), (vhi_t)(y)))
731 #  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)(x), (vhi_t)(y)))
732 #  define swap(x) ((vec_t)__builtin_ia32_pshufd( \
733                    (vsi_t)__builtin_ia32_pshufhw( \
734                           __builtin_ia32_pshuflw((vhi_t)(x), 0b00011011), 0b00011011), 0b01001110))
735 # elif INT_SIZE == 4 || UINT_SIZE == 4
736 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)(x), (vsi_t)(y)))
737 #  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)(x), (vsi_t)(y)))
738 #  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b00011011))
739 # elif INT_SIZE == 8 || UINT_SIZE == 8
740 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)(x), (vdi_t)(y)))
741 #  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)(x), (vdi_t)(y)))
742 #  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b01001110))
743 # endif
744 # if UINT_SIZE == 1
745 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)(x), (vqi_t)(y)))
746 #  define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)(x), (vqi_t)(y)))
747 # elif INT_SIZE == 2
748 #  define max(x, y) __builtin_ia32_pmaxsw128(x, y)
749 #  define min(x, y) __builtin_ia32_pminsw128(x, y)
750 #  define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)
751 # elif UINT_SIZE == 2
752 #  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)(x), (vhi_t)(y)))
753 # elif UINT_SIZE == 4
754 #  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)(x), (vsi_t)(y)))
755 # endif
756 # define select(d, x, y, m) ({ \
757     void *d_ = (d); \
758     vqi_t m_ = (vqi_t)(m); \
759     __builtin_ia32_maskmovdqu((vqi_t)(x),  m_, d_); \
760     __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
761 })
762 #elif VEC_SIZE == 32 && defined(__AVX2__)
763 # define swap_lanes(x, y, func, type) ({ \
764     long long __attribute__((vector_size(16))) t_ = __builtin_ia32_extract128i256((vdi_t)(y), 0); \
765     type t1_ = (type)__builtin_ia32_insert128i256((vdi_t)(x), t_, 1), t2_; \
766     t_ = __builtin_ia32_extract128i256((vdi_t)(x), 1); \
767     t2_ = (type)__builtin_ia32_insert128i256((vdi_t)(y), t_, 0); \
768     func(t1_, t2_); \
769 })
770 # if INT_SIZE == 1 || UINT_SIZE == 1
771 #  define broadcast(x) ({ char s_ = (x); vec_t d_; asm ( "vpbroadcastb %1,%0" : "=x" (d_) : "m" (s_)); d_; })
772 #  define copysignz(x, y) ((vec_t)__builtin_ia32_psignb256((vqi_t)(x), (vqi_t)(y)))
773 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
774                                                        (vdi_t)(x), (n) * 8))
775 # elif INT_SIZE == 2 || UINT_SIZE == 2
776 #  define broadcast(x) ({ short s_ = (x); vec_t d_; asm ( "vpbroadcastw %1,%0" : "=x" (d_) : "m" (s_)); d_; })
777 #  define copysignz(x, y) ((vec_t)__builtin_ia32_psignw256((vhi_t)(x), (vhi_t)(y)))
778 #  define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddw256, vhi_t))
779 #  define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubw256, vhi_t))
780 #  define mix(x, y) ((vec_t)__builtin_ia32_pblendw256((vhi_t)(x), (vhi_t)(y), 0b10101010))
781 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
782                                                        (vdi_t)(x), (n) * 16))
783 # elif INT_SIZE == 4 || UINT_SIZE == 4
784 #  define broadcast(x) ({ int s_ = (x); vec_t d_; asm ( "vpbroadcastd %1,%0" : "=x" (d_) : "m" (s_)); d_; })
785 #  define copysignz(x, y) ((vec_t)__builtin_ia32_psignd256((vsi_t)(x), (vsi_t)(y)))
786 #  define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddd256, vsi_t))
787 #  define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubd256, vsi_t))
788 #  define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b10101010))
789 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
790                                                        (vdi_t)(x), (n) * 32))
791 #  define select(d, x, y, m) ({ \
792     vsi_t m_ = (vsi_t)(m); \
793     *(d) = (vec_t)__builtin_ia32_maskloadd256((vsi_t *)&(x),  m_); \
794     __builtin_ia32_maskstored256((vsi_t *)(d), ~m_, (vsi_t)(y)); \
795 })
796 #  define swap(x) ((vec_t)__builtin_ia32_permvarsi256((vsi_t)(x), (vsi_t)inv - 1))
797 # elif INT_SIZE == 8 || UINT_SIZE == 8
798 #  define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b11001100))
799 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
800                                                        (vdi_t)(x), (n) * 64))
801 #  define select(d, x, y, m) ({ \
802     vdi_t m_ = (vdi_t)(m); \
803     *(d) = (vec_t)__builtin_ia32_maskloadq256((vdi_t *)&(x),  m_); \
804     __builtin_ia32_maskstoreq256((vdi_t *)(d), ~m_, (vdi_t)(y)); \
805 })
806 #  define swap(x) ((vec_t)__builtin_ia32_permdi256((vdi_t)(x), 0b00011011))
807 #  define swap2(x) ({ \
808     vdi_t t_ = __builtin_ia32_permdi256((vdi_t)(x), 0b10110001); \
809     (vec_t)__builtin_ia32_permti256(t_, t_, 0b00000001); \
810 })
811 # endif
812 # if INT_SIZE == 1
813 #  define abs(x) ((vec_t)__builtin_ia32_pabsb256((vqi_t)(x)))
814 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb256((vqi_t)(x), (vqi_t)(y)))
815 #  define min(x, y) ((vec_t)__builtin_ia32_pminsb256((vqi_t)(x), (vqi_t)(y)))
816 #  define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw256((vqi_t)(x)))
817 #  define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd256((vqi_t)(x)))
818 #  define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq256((vqi_t)(x)))
819 # elif UINT_SIZE == 1
820 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxub256((vqi_t)(x), (vqi_t)(y)))
821 #  define min(x, y) ((vec_t)__builtin_ia32_pminub256((vqi_t)(x), (vqi_t)(y)))
822 #  define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw256((vqi_t)(x)))
823 #  define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd256((vqi_t)(x)))
824 #  define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq256((vqi_t)(x)))
825 # elif INT_SIZE == 2
826 #  define abs(x) __builtin_ia32_pabsw256(x)
827 #  define max(x, y) __builtin_ia32_pmaxsw256(x, y)
828 #  define min(x, y) __builtin_ia32_pminsw256(x, y)
829 #  define mul_hi(x, y) __builtin_ia32_pmulhw256(x, y)
830 #  define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd256(x))
831 #  define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq256(x))
832 # elif UINT_SIZE == 2
833 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxuw256((vhi_t)(x), (vhi_t)(y)))
834 #  define min(x, y) ((vec_t)__builtin_ia32_pminuw256((vhi_t)(x), (vhi_t)(y)))
835 #  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw256((vhi_t)(x), (vhi_t)(y)))
836 #  define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd256((vhi_t)(x)))
837 #  define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq256((vhi_t)(x)))
838 # elif INT_SIZE == 4
839 #  define abs(x) __builtin_ia32_pabsd256(x)
840 #  define max(x, y) __builtin_ia32_pmaxsd256(x, y)
841 #  define min(x, y) __builtin_ia32_pminsd256(x, y)
842 #  define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq256(x))
843 # elif UINT_SIZE == 4
844 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxud256((vsi_t)(x), (vsi_t)(y)))
845 #  define min(x, y) ((vec_t)__builtin_ia32_pminud256((vsi_t)(x), (vsi_t)(y)))
846 #  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq256((vsi_t)(x), (vsi_t)(y)))
847 #  define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq256((vsi_t)(x)))
848 # elif INT_SIZE == 8
849 #  define broadcast(x) ({ \
850     long long s_ = (x); \
851     long long __attribute__((vector_size(16))) t_; \
852     vec_t d_; \
853     asm ( "vpbroadcastq %1,%0" : "=x" (t_) : "m" (s_)); \
854     asm ( "vbroadcasti128 %1,%0" : "=x" (d_) : "m" (t_)); \
855     d_; \
856 })
857 # elif UINT_SIZE == 8
858 #  define broadcast(x) ({ long long s_ = (x); vec_t d_; asm ( "vpbroadcastq %1,%0" : "=x" (d_) : "m" (s_)); d_; })
859 # endif
860 #endif
861 #if VEC_SIZE == 16 && defined(__SSE3__)
862 # if FLOAT_SIZE == 4
863 #  define addsub(x, y) __builtin_ia32_addsubps(x, y)
864 #  define dup_hi(x) __builtin_ia32_movshdup(x)
865 #  define dup_lo(x) __builtin_ia32_movsldup(x)
866 #  define hadd(x, y) __builtin_ia32_haddps(x, y)
867 #  define hsub(x, y) __builtin_ia32_hsubps(x, y)
868 # elif FLOAT_SIZE == 8
869 #  define addsub(x, y) __builtin_ia32_addsubpd(x, y)
870 #  define dup_lo(x) ({ \
871     double __attribute__((vector_size(16))) r_; \
872     asm ( "movddup %1,%0" : "=x" (r_) : "m" ((x)[0]) ); \
873     r_; \
874 })
875 #  define hadd(x, y) __builtin_ia32_haddpd(x, y)
876 #  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
877 # endif
878 #elif VEC_SIZE == 32 && defined(__AVX__)
879 # if FLOAT_SIZE == 4
880 #  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
881 #  define dup_hi(x) __builtin_ia32_movshdup256(x)
882 #  define dup_lo(x) __builtin_ia32_movsldup256(x)
883 #  ifdef __AVX2__
884 #   define hadd(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_haddps256(x, y), \
885                                                   (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
886 #   define hsub(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_hsubps256(x, y), \
887                                                   (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
888 #  else
889 #   define hadd(x, y) ({ \
890         vec_t t_ = __builtin_ia32_haddps256(x, y); \
891         (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
892 })
893 #   define hsub(x, y) ({ \
894         vec_t t_ = __builtin_ia32_hsubps256(x, y); \
895         (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
896 })
897 #  endif
898 # elif FLOAT_SIZE == 8
899 #  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
900 #  define dup_lo(x) __builtin_ia32_movddup256(x)
901 #  ifdef __AVX2__
902 #   define hadd(x, y) __builtin_ia32_permdf256(__builtin_ia32_haddpd256(x, y), 0b11011000)
903 #   define hsub(x, y) __builtin_ia32_permdf256(__builtin_ia32_hsubpd256(x, y), 0b11011000)
904 #  else
905 #   define hadd(x, y) ({ \
906         vec_t t_ = __builtin_ia32_haddpd256(x, y); \
907         (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
908 })
909 #   define hsub(x, y) ({ \
910         vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
911         (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
912 })
913 #  endif
914 # endif
915 #endif
916 #if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
917 # if INT_SIZE == 1
918 #  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
919 # elif INT_SIZE == 2
920 #  define abs(x) __builtin_ia32_pabsw128(x)
921 # elif INT_SIZE == 4
922 #  define abs(x) __builtin_ia32_pabsd128(x)
923 # endif
924 # if INT_SIZE == 1 || UINT_SIZE == 1
925 #  define copysignz(x, y) ((vec_t)__builtin_ia32_psignb128((vqi_t)(x), (vqi_t)(y)))
926 #  define swap(x) ((vec_t)__builtin_ia32_pshufb128((vqi_t)(x), (vqi_t)(inv - 1)))
927 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 8))
928 # elif INT_SIZE == 2 || UINT_SIZE == 2
929 #  define copysignz(x, y) ((vec_t)__builtin_ia32_psignw128((vhi_t)(x), (vhi_t)(y)))
930 #  define hadd(x, y) ((vec_t)__builtin_ia32_phaddw128((vhi_t)(x), (vhi_t)(y)))
931 #  define hsub(x, y) ((vec_t)__builtin_ia32_phsubw128((vhi_t)(x), (vhi_t)(y)))
932 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 16))
933 # elif INT_SIZE == 4 || UINT_SIZE == 4
934 #  define copysignz(x, y) ((vec_t)__builtin_ia32_psignd128((vsi_t)(x), (vsi_t)(y)))
935 #  define hadd(x, y) ((vec_t)__builtin_ia32_phaddd128((vsi_t)(x), (vsi_t)(y)))
936 #  define hsub(x, y) ((vec_t)__builtin_ia32_phsubd128((vsi_t)(x), (vsi_t)(y)))
937 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 32))
938 # elif INT_SIZE == 8 || UINT_SIZE == 8
939 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
940 # endif
941 #endif
942 #if VEC_SIZE == 16 && defined(__SSE4_1__) && !defined(__AVX512VL__)
943 # if INT_SIZE == 1
944 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
945 #  define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
946 #  define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw128((vqi_t)(x)))
947 #  define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd128((vqi_t)(x)))
948 #  define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq128((vqi_t)(x)))
949 # elif INT_SIZE == 2
950 #  define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd128(x))
951 #  define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq128(x))
952 # elif INT_SIZE == 4
953 #  define max(x, y) __builtin_ia32_pmaxsd128(x, y)
954 #  define min(x, y) __builtin_ia32_pminsd128(x, y)
955 #  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuldq128(x, y))
956 #  define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq128(x))
957 # elif UINT_SIZE == 1
958 #  define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw128((vqi_t)(x)))
959 #  define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd128((vqi_t)(x)))
960 #  define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq128((vqi_t)(x)))
961 # elif UINT_SIZE == 2
962 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxuw128((vhi_t)(x), (vhi_t)(y)))
963 #  define min(x, y) ((vec_t)__builtin_ia32_pminuw128((vhi_t)(x), (vhi_t)(y)))
964 #  define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd128((vhi_t)(x)))
965 #  define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq128((vhi_t)(x)))
966 # elif UINT_SIZE == 4
967 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxud128((vsi_t)(x), (vsi_t)(y)))
968 #  define min(x, y) ((vec_t)__builtin_ia32_pminud128((vsi_t)(x), (vsi_t)(y)))
969 #  define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq128((vsi_t)(x)))
970 # endif
971 # undef select
972 # if defined(INT_SIZE) || defined(UINT_SIZE)
973 #  define select(d, x, y, m) \
974     (*(d) = (vec_t)__builtin_ia32_pblendvb128((vqi_t)(y), (vqi_t)(x), (vqi_t)(m)))
975 # elif FLOAT_SIZE == 4
976 #  define dot_product(x, y) __builtin_ia32_dpps(x, y, 0b11110001)
977 #  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps(y, x, m))
978 #  define trunc(x) __builtin_ia32_roundps(x, 0b1011)
979 # elif FLOAT_SIZE == 8
980 #  define dot_product(x, y) __builtin_ia32_dppd(x, y, 0b00110001)
981 #  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd(y, x, m))
982 #  define trunc(x) __builtin_ia32_roundpd(x, 0b1011)
983 # endif
984 # if INT_SIZE == 2 || UINT_SIZE == 2
985 #  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b10101010))
986 # elif INT_SIZE == 4 || UINT_SIZE == 4
987 #  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11001100))
988 # elif INT_SIZE == 8 || UINT_SIZE == 8
989 #  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11110000))
990 # elif FLOAT_SIZE == 4
991 #  define mix(x, y) __builtin_ia32_blendps(x, y, 0b1010)
992 # elif FLOAT_SIZE == 8
993 #  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
994 # endif
995 #endif
996 #if VEC_SIZE == 32 && defined(__AVX__) && !defined(__AVX512VL__)
997 # if FLOAT_SIZE == 4
998 #  define dot_product(x, y) ({ \
999     vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
1000     (vec_t){t_[0] + t_[4]}; \
1001 })
1002 #  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
1003 #  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
1004 #  define select2(d, x, y, m) ({ \
1005     vsi_t m_ = (vsi_t)(m); \
1006     *(d) = __builtin_ia32_maskloadps256(&(x),  m_); \
1007     __builtin_ia32_maskstoreps256(d, ~m_, y); \
1008 })
1009 #  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
1010 # elif FLOAT_SIZE == 8
1011 #  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
1012 #  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
1013 #  define select2(d, x, y, m) ({ \
1014     vdi_t m_ = (vdi_t)(m); \
1015     *(d) = __builtin_ia32_maskloadpd256(&(x),  m_); \
1016     __builtin_ia32_maskstorepd256(d, ~m_, y); \
1017 })
1018 #  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
1019 # endif
1020 #endif
1021 #if VEC_SIZE == FLOAT_SIZE
1022 # define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
1023 # define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
1024 # if defined(__SSE4_1__) && !defined(__AVX512F__)
1025 #  if FLOAT_SIZE == 4
1026 #   define trunc(x) scalar_1op(x, "roundss $0b1011, %[in], %[out]")
1027 #  elif FLOAT_SIZE == 8
1028 #   define trunc(x) scalar_1op(x, "roundsd $0b1011, %[in], %[out]")
1029 #  endif
1030 # endif
1031 #endif
1032 #ifdef __XOP__
1033 # undef select
1034 # if VEC_SIZE == 16
1035 #  if INT_SIZE == 2 || INT_SIZE == 4
1036 #   include "simd-fma.c"
1037 #  endif
1038 #  define select(d, x, y, m) \
1039     (*(d) = (vec_t)__builtin_ia32_vpcmov((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
1040 #  if INT_SIZE == 1 || UINT_SIZE == 1
1041 #   define swap2(x) ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), (vqi_t)inv - 1))
1042 #  elif INT_SIZE == 2 || UINT_SIZE == 2
1043 #   define swap2(x) \
1044     ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), \
1045                                   (vqi_t)(__builtin_ia32_vprotwi(2 * (vhi_t)inv - 1, 8) | \
1046                                           (2 * inv - 2))))
1047 #  elif FLOAT_SIZE == 4
1048 #   define frac(x) __builtin_ia32_vfrczps(x)
1049 #   undef swap2
1050 #   define swap2(x) ({ \
1051     /* Buggy in gcc 7.1.0 and earlier. */ \
1052     /* __builtin_ia32_vpermil2ps((vec_t){}, x, __builtin_ia32_cvtps2dq(inv) + 3, 0) */ \
1053     vec_t t_; \
1054     asm ( "vpermil2ps $0, %3, %2, %1, %0" : \
1055           "=x" (t_) : \
1056           "x" ((vec_t){}), "m" (x), "x" (__builtin_ia32_cvtps2dq(inv) + 3) ); \
1057     t_; \
1058 })
1059 #  elif FLOAT_SIZE == 8
1060 #   define frac(x) __builtin_ia32_vfrczpd(x)
1061 #   undef swap2
1062 #   define swap2(x) ({ \
1063     /* Buggy in gcc 7.1.0 and earlier. */ \
1064     /* __builtin_ia32_vpermil2pd((vec_t){}, x, */ \
1065     /*                            __builtin_ia32_pmovsxdq128( */ \
1066     /*                                __builtin_ia32_cvtpd2dq(inv) + 1) << 1, 0) */ \
1067     vdi_t s_ = __builtin_ia32_pmovsxdq128( \
1068                    __builtin_ia32_cvtpd2dq(inv) + 1) << 1; \
1069     vec_t t_; \
1070     asm ( "vpermil2pd $0, %3, %2, %1, %0" : \
1071           "=x" (t_) : "x" ((vec_t){}), "x" (x), "m" (s_) ); \
1072     t_; \
1073 })
1074 #  endif
1075 #  if INT_SIZE == 1
1076 #   define hadd(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphaddbw((vqi_t)(x)), \
1077                                                          __builtin_ia32_vphaddbw((vqi_t)(y))))
1078 #   define hsub(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphsubbw((vqi_t)(x)), \
1079                                                          __builtin_ia32_vphsubbw((vqi_t)(y))))
1080 #  elif UINT_SIZE == 1
1081 #   define hadd(x, y) ((vec_t)__builtin_ia32_packuswb128(__builtin_ia32_vphaddubw((vqi_t)(x)), \
1082                                                          __builtin_ia32_vphaddubw((vqi_t)(y))))
1083 #  elif INT_SIZE == 2
1084 #   undef hadd
1085 #   define hadd(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphaddwd(x), \
1086                                                  __builtin_ia32_vphaddwd(y))
1087 #   undef hsub
1088 #   define hsub(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphsubwd(x), \
1089                                                  __builtin_ia32_vphsubwd(y))
1090 #  elif UINT_SIZE == 2
1091 #   undef hadd
1092 #   define hadd(x, y) ((vec_t)__builtin_ia32_packusdw128(__builtin_ia32_vphadduwd((vhi_t)(x)), \
1093                                                          __builtin_ia32_vphadduwd((vhi_t)(y))))
1094 #   undef hsub
1095 #  endif
1096 # elif VEC_SIZE == 32
1097 #  define select(d, x, y, m) \
1098     (*(d) = (vec_t)__builtin_ia32_vpcmov256((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
1099 #  if FLOAT_SIZE == 4
1100 #   define frac(x) __builtin_ia32_vfrczps256(x)
1101 #  elif FLOAT_SIZE == 8
1102 #   define frac(x) __builtin_ia32_vfrczpd256(x)
1103 #  endif
1104 # elif VEC_SIZE == FLOAT_SIZE
1105 #  if VEC_SIZE == 4
1106 #   define frac(x) scalar_1op(x, "vfrczss %[in], %[out]")
1107 #  elif VEC_SIZE == 8
1108 #   define frac(x) scalar_1op(x, "vfrczsd %[in], %[out]")
1109 #  endif
1110 # endif
1111 #endif
1112 
1113 #if VEC_SIZE >= 16
1114 
1115 # if !defined(low_half) && defined(HALF_SIZE)
low_half(vec_t x)1116 static inline half_t low_half(vec_t x)
1117 {
1118 #  if HALF_SIZE < VEC_SIZE
1119     half_t y;
1120     unsigned int i;
1121 
1122     for ( i = 0; i < ELEM_COUNT / 2; ++i )
1123         y[i] = x[i];
1124 
1125     return y;
1126 #  else
1127     return x;
1128 #  endif
1129 }
1130 # endif
1131 
1132 # if !defined(low_quarter) && defined(QUARTER_SIZE)
low_quarter(vec_t x)1133 static inline quarter_t low_quarter(vec_t x)
1134 {
1135 #  if QUARTER_SIZE < VEC_SIZE
1136     quarter_t y;
1137     unsigned int i;
1138 
1139     for ( i = 0; i < ELEM_COUNT / 4; ++i )
1140         y[i] = x[i];
1141 
1142     return y;
1143 #  else
1144     return x;
1145 #  endif
1146 }
1147 # endif
1148 
1149 # if !defined(low_eighth) && defined(EIGHTH_SIZE)
low_eighth(vec_t x)1150 static inline eighth_t low_eighth(vec_t x)
1151 {
1152 #  if EIGHTH_SIZE < VEC_SIZE
1153     eighth_t y;
1154     unsigned int i;
1155 
1156     for ( i = 0; i < ELEM_COUNT / 8; ++i )
1157         y[i] = x[i];
1158 
1159     return y;
1160 #  else
1161     return x;
1162 #  endif
1163 }
1164 # endif
1165 
1166 #endif
1167 
1168 #ifdef broadcast_pair
1169 # if ELEM_COUNT == 4
1170 #  define broadcast_half broadcast_pair
1171 # elif ELEM_COUNT == 8
1172 #  define broadcast_quarter broadcast_pair
1173 # elif ELEM_COUNT == 16
1174 #  define broadcast_eighth broadcast_pair
1175 # endif
1176 #endif
1177 
1178 #ifdef insert_pair
1179 # if ELEM_COUNT == 4
1180 #  define insert_half insert_pair
1181 # elif ELEM_COUNT == 8
1182 #  define insert_quarter insert_pair
1183 # elif ELEM_COUNT == 16
1184 #  define insert_eighth insert_pair
1185 # endif
1186 #endif
1187 
1188 #ifdef broadcast_quartet
1189 # if ELEM_COUNT == 8
1190 #  define broadcast_half broadcast_quartet
1191 # elif ELEM_COUNT == 16
1192 #  define broadcast_quarter broadcast_quartet
1193 # endif
1194 #endif
1195 
1196 #ifdef insert_quartet
1197 # if ELEM_COUNT == 8
1198 #  define insert_half insert_quartet
1199 # elif ELEM_COUNT == 16
1200 #  define insert_quarter insert_quartet
1201 # endif
1202 #endif
1203 
1204 #if defined(broadcast_octet) && ELEM_COUNT == 16
1205 # define broadcast_half broadcast_octet
1206 #endif
1207 
1208 #if defined(insert_octet) && ELEM_COUNT == 16
1209 # define insert_half insert_octet
1210 #endif
1211 
1212 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
1213 # include "simd-fma.c"
1214 #endif
1215 
simd_test(void)1216 int simd_test(void)
1217 {
1218     unsigned int i, j;
1219     vec_t x, y, z, src, inv, alt, sh;
1220     vint_t interleave_lo, interleave_hi;
1221 
1222     for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
1223     {
1224         src[i] = i + 1;
1225         inv[i] = ELEM_COUNT - i;
1226 #ifdef UINT_SIZE
1227         alt[i] = -!(i & 1);
1228 #else
1229         alt[i] = i & 1 ? -1 : 1;
1230 #endif
1231         if ( !(i & (i + 1)) )
1232             --j;
1233         sh[i] = j;
1234 
1235         interleave_lo[i] = ((i & 1) * ELEM_COUNT) | (i >> 1);
1236         interleave_hi[i] = interleave_lo[i] + (ELEM_COUNT / 2);
1237     }
1238 
1239     touch(src);
1240     x = src;
1241     touch(x);
1242     if ( !eq(x, src) ) return __LINE__;
1243 
1244     touch(src);
1245     y = x + src;
1246     touch(src);
1247     touch(y);
1248     if ( !eq(y, 2 * src) ) return __LINE__;
1249 
1250     touch(src);
1251     z = y -= src;
1252     touch(z);
1253     if ( !eq(x, z) ) return __LINE__;
1254 
1255 #if defined(UINT_SIZE)
1256 
1257     touch(inv);
1258     x |= inv;
1259     touch(inv);
1260     y &= inv;
1261     touch(inv);
1262     z ^= inv;
1263     touch(inv);
1264     touch(x);
1265     if ( !eq(x & ~y, z) ) return __LINE__;
1266 
1267 #elif ELEM_SIZE > 1 || VEC_SIZE <= 8
1268 
1269     touch(src);
1270     x *= src;
1271     y = inv * inv;
1272     touch(src);
1273     z = src + inv;
1274     touch(inv);
1275     z *= (src - inv);
1276     if ( !eq(x - y, z) ) return __LINE__;
1277 
1278 #endif
1279 
1280 #if defined(FLOAT_SIZE)
1281 
1282     x = src * alt;
1283     touch(alt);
1284     y = src / alt;
1285     if ( !eq(x, y) ) return __LINE__;
1286     touch(alt);
1287     touch(src);
1288     if ( !eq(x * -alt, -src) ) return __LINE__;
1289 
1290 # ifdef to_int
1291 
1292     touch(src);
1293     x = to_int(src);
1294     touch(src);
1295     if ( !eq(x, src) ) return __LINE__;
1296 
1297 #  ifdef recip
1298     touch(src);
1299     x = recip(src);
1300     touch(src);
1301     touch(x);
1302     if ( !eq(to_int(recip(x)), src) ) return __LINE__;
1303 
1304 #   ifdef rsqrt
1305     x = src * src;
1306     touch(x);
1307     y = rsqrt(x);
1308     touch(y);
1309     if ( !eq(to_int(recip(y)), src) ) return __LINE__;
1310     touch(src);
1311     if ( !eq(to_int(y), to_int(recip(src))) ) return __LINE__;
1312 #   endif
1313 #  endif
1314 
1315 # endif
1316 
1317 # ifdef to_wint
1318     touch(src);
1319     x = to_wint(src);
1320     touch(src);
1321     if ( !eq(x, src) ) return __LINE__;
1322 # endif
1323 
1324 # ifdef to_uint
1325     touch(src);
1326     x = to_uint(src);
1327     touch(src);
1328     if ( !eq(x, src) ) return __LINE__;
1329 # endif
1330 
1331 # ifdef to_uwint
1332     touch(src);
1333     x = to_uwint(src);
1334     touch(src);
1335     if ( !eq(x, src) ) return __LINE__;
1336 # endif
1337 
1338 # ifdef sqrt
1339     x = src * src;
1340     touch(x);
1341     if ( !eq(sqrt(x), src) ) return __LINE__;
1342 # endif
1343 
1344 # ifdef trunc
1345     x = 1 / src;
1346     y = (vec_t){ 1 };
1347     touch(x);
1348     z = trunc(x);
1349     if ( !eq(y, z) ) return __LINE__;
1350 # endif
1351 
1352 # ifdef frac
1353     touch(src);
1354     x = frac(src);
1355     touch(src);
1356     if ( !eq(x, (vec_t){}) ) return __LINE__;
1357 
1358     x = 1 / (src + 1);
1359     touch(x);
1360     y = frac(x);
1361     touch(x);
1362     if ( !eq(x, y) ) return __LINE__;
1363 # endif
1364 
1365 # if defined(trunc) && defined(frac)
1366     x = src / 4;
1367     touch(x);
1368     y = trunc(x);
1369     touch(x);
1370     z = frac(x);
1371     touch(x);
1372     if ( !eq(x, y + z) ) return __LINE__;
1373 # endif
1374 
1375 #else
1376 
1377 # if ELEM_SIZE > 1
1378 
1379     touch(inv);
1380     x = src * inv;
1381     touch(inv);
1382     y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
1383     for ( i = 1; i < ELEM_COUNT / 2; ++i )
1384         y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
1385     if ( !eq(x, y) ) return __LINE__;
1386 
1387 #  ifdef mul_hi
1388     touch(alt);
1389     x = mul_hi(src, alt);
1390     touch(alt);
1391 #   ifdef INT_SIZE
1392     if ( !eq(x, alt < 0) ) return __LINE__;
1393 #   else
1394     if ( !eq(x, (src & alt) + alt) ) return __LINE__;
1395 #   endif
1396 #  endif
1397 
1398 #  ifdef mul_full
1399     x = src ^ alt;
1400     touch(inv);
1401     y = mul_full(x, inv);
1402     touch(inv);
1403     for ( i = 0; i < ELEM_COUNT; i += 2 )
1404     {
1405         unsigned long long res = x[i] * 1ULL * inv[i];
1406 
1407         z[i] = res;
1408         z[i + 1] = res >> (ELEM_SIZE << 3);
1409     }
1410     if ( !eq(y, z) ) return __LINE__;
1411 #  endif
1412 
1413     z = src;
1414 #  ifdef INT_SIZE
1415     z *= alt;
1416 #  endif
1417     touch(z);
1418     x = z << 3;
1419     touch(z);
1420     y = z << 2;
1421     touch(z);
1422     if ( !eq(x, y + y) ) return __LINE__;
1423 
1424     touch(x);
1425     z = x >> 2;
1426     touch(x);
1427     if ( !eq(y, z + z) ) return __LINE__;
1428 
1429     z = src;
1430 #  ifdef INT_SIZE
1431     z *= alt;
1432 #  endif
1433     /*
1434      * Note that despite the touch()-es here there doesn't appear to be a way
1435      * to make the compiler use a memory operand for the shift instruction (at
1436      * least without resorting to built-ins).
1437      */
1438     j = 3;
1439     touch(j);
1440     x = z << j;
1441     touch(j);
1442     j = 2;
1443     touch(j);
1444     y = z << j;
1445     touch(j);
1446     if ( !eq(x, y + y) ) return __LINE__;
1447 
1448     z = x >> j;
1449     touch(j);
1450     if ( !eq(y, z + z) ) return __LINE__;
1451 
1452 # endif
1453 
1454 # if ELEM_SIZE == 2 || defined(__SSE4_1__)
1455     /*
1456      * Even when there are no instructions with varying shift counts per
1457      * field, the code turns out to be a nice exercise for pextr/pinsr.
1458      */
1459     z = src;
1460 #  ifdef INT_SIZE
1461     z *= alt;
1462 #  endif
1463     /*
1464      * Zap elements for which the shift count is zero (and the hence the
1465      * decrement below would yield a negative count.
1466      */
1467     z &= (sh > 0);
1468     touch(sh);
1469     x = z << sh;
1470     touch(sh);
1471     --sh;
1472     touch(sh);
1473     y = z << sh;
1474     if ( !eq(x, y + y) ) return __LINE__;
1475 
1476 #  if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
1477     touch(sh);
1478     x = y >> sh;
1479     if ( !eq(x, z) ) return __LINE__;
1480 #  endif
1481 
1482 # endif
1483 
1484 #endif
1485 
1486 #if defined(max) && defined(min)
1487 # ifdef UINT_SIZE
1488     touch(inv);
1489     x = min(src, inv);
1490     touch(inv);
1491     y = max(src, inv);
1492     touch(inv);
1493     if ( !eq(x + y, src + inv) ) return __LINE__;
1494 # else
1495     x = src * alt;
1496     y = inv * alt;
1497     touch(y);
1498     z = max(x, y);
1499     touch(y);
1500     y = min(x, y);
1501     touch(y);
1502     if ( !eq((y + z) * alt, src + inv) ) return __LINE__;
1503 # endif
1504 #endif
1505 
1506 #ifdef abs
1507     x = src * alt;
1508     touch(x);
1509     if ( !eq(abs(x), src) ) return __LINE__;
1510 #endif
1511 
1512 #ifdef copysignz
1513     touch(alt);
1514     if ( !eq(copysignz((vec_t){} + 1, alt), alt) ) return __LINE__;
1515 #endif
1516 
1517 #ifdef swap
1518     touch(src);
1519     if ( !eq(swap(src), inv) ) return __LINE__;
1520 #endif
1521 
1522 #ifdef swap2
1523     touch(src);
1524     if ( !eq(swap2(src), inv) ) return __LINE__;
1525 #endif
1526 
1527 #ifdef swap3
1528     touch(src);
1529     if ( !eq(swap3(src), inv) ) return __LINE__;
1530     touch(src);
1531 #endif
1532 
1533 #ifdef broadcast
1534     if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
1535 #endif
1536 
1537 #ifdef broadcast2
1538     if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
1539 #endif
1540 
1541 #if defined(broadcast_half) && defined(insert_half)
1542     {
1543         half_t aux = low_half(src);
1544 
1545         touch(aux);
1546         x = broadcast_half(aux);
1547         touch(aux);
1548         y = insert_half(src, aux, 1);
1549         if ( !eq(x, y) ) return __LINE__;
1550     }
1551 #endif
1552 
1553 #if defined(broadcast_quarter) && defined(insert_quarter)
1554     {
1555         quarter_t aux = low_quarter(src);
1556 
1557         touch(aux);
1558         x = broadcast_quarter(aux);
1559         touch(aux);
1560         y = insert_quarter(src, aux, 1);
1561         touch(aux);
1562         y = insert_quarter(y, aux, 2);
1563         touch(aux);
1564         y = insert_quarter(y, aux, 3);
1565         if ( !eq(x, y) ) return __LINE__;
1566     }
1567 #endif
1568 
1569 #if defined(broadcast_eighth) && defined(insert_eighth) && \
1570     /* At least gcc 7.3 "optimizes" away all insert_eighth() calls below. */ \
1571     __GNUC__ >= 8
1572     {
1573         eighth_t aux = low_eighth(src);
1574 
1575         touch(aux);
1576         x = broadcast_eighth(aux);
1577         touch(aux);
1578         y = insert_eighth(src, aux, 1);
1579         touch(aux);
1580         y = insert_eighth(y, aux, 2);
1581         touch(aux);
1582         y = insert_eighth(y, aux, 3);
1583         touch(aux);
1584         y = insert_eighth(y, aux, 4);
1585         touch(aux);
1586         y = insert_eighth(y, aux, 5);
1587         touch(aux);
1588         y = insert_eighth(y, aux, 6);
1589         touch(aux);
1590         y = insert_eighth(y, aux, 7);
1591         if ( !eq(x, y) ) return __LINE__;
1592     }
1593 #endif
1594 
1595 #if defined(interleave_lo) && defined(interleave_hi)
1596     touch(src);
1597     x = interleave_lo(inv, src);
1598     touch(src);
1599     y = interleave_hi(inv, src);
1600     touch(src);
1601 # ifdef UINT_SIZE
1602     z = ((x - y) ^ ~alt) - ~alt;
1603 # else
1604     z = (x - y) * alt;
1605 # endif
1606 # ifdef broadcast
1607     if ( !eq(z, broadcast(ELEM_COUNT / 2)) ) return __LINE__;
1608 # else
1609     if ( !eq(z, ELEM_COUNT / 2) ) return __LINE__;
1610 # endif
1611 #endif
1612 
1613 #if defined(INT_SIZE) && defined(widen1) && defined(interleave_lo)
1614 
1615     x = src * alt;
1616     y = interleave_lo(x, alt < 0);
1617     touch(x);
1618     z = widen1(low_half(x));
1619     touch(x);
1620     if ( !eq(z, y) ) return __LINE__;
1621 
1622 # ifdef widen2
1623     y = interleave_lo(alt < 0, alt < 0);
1624     y = interleave_lo(z, y);
1625     touch(x);
1626     z = widen2(low_quarter(x));
1627     touch(x);
1628     if ( !eq(z, y) ) return __LINE__;
1629 
1630 #  ifdef widen3
1631     y = interleave_lo(alt < 0, alt < 0);
1632     y = interleave_lo(y, y);
1633     y = interleave_lo(z, y);
1634     touch(x);
1635     z = widen3(low_eighth(x));
1636     touch(x);
1637     if ( !eq(z, y) ) return __LINE__;
1638 #  endif
1639 # endif
1640 
1641 #endif
1642 
1643 #if defined(UINT_SIZE) && defined(interleave_lo)
1644 
1645     y = interleave_lo(src, (vec_t){});
1646     z = interleave_lo(y, (vec_t){});
1647 
1648 # ifdef widen1
1649     touch(src);
1650     x = widen1(low_half(src));
1651     touch(src);
1652     if ( !eq(x, y) ) return __LINE__;
1653 # endif
1654 
1655 # ifdef widen2
1656     touch(src);
1657     x = widen2(low_quarter(src));
1658     touch(src);
1659     if ( !eq(x, z) ) return __LINE__;
1660 # endif
1661 
1662 # ifdef widen3
1663     touch(src);
1664     x = widen3(low_eighth(src));
1665     touch(src);
1666     if ( !eq(x, interleave_lo(z, (vec_t){})) ) return __LINE__;
1667 # endif
1668 
1669 #endif
1670 
1671 #if defined(widen1) && defined(shrink1)
1672     {
1673         half_t aux1 = low_half(src), aux2;
1674 
1675         touch(aux1);
1676         x = widen1(aux1);
1677         touch(x);
1678         aux2 = shrink1(x);
1679         touch(aux2);
1680         for ( i = 0; i < ELEM_COUNT / 2; ++i )
1681             if ( aux2[i] != src[i] )
1682                 return __LINE__;
1683     }
1684 #endif
1685 
1686 #if defined(widen2) && defined(shrink2)
1687     {
1688         quarter_t aux1 = low_quarter(src), aux2;
1689 
1690         touch(aux1);
1691         x = widen2(aux1);
1692         touch(x);
1693         aux2 = shrink2(x);
1694         touch(aux2);
1695         for ( i = 0; i < ELEM_COUNT / 4; ++i )
1696             if ( aux2[i] != src[i] )
1697                 return __LINE__;
1698     }
1699 #endif
1700 
1701 #if defined(widen3) && defined(shrink3)
1702     {
1703         eighth_t aux1 = low_eighth(src), aux2;
1704 
1705         touch(aux1);
1706         x = widen3(aux1);
1707         touch(x);
1708         aux2 = shrink3(x);
1709         touch(aux2);
1710         for ( i = 0; i < ELEM_COUNT / 8; ++i )
1711             if ( aux2[i] != src[i] )
1712                 return __LINE__;
1713     }
1714 #endif
1715 
1716 #ifdef dup_lo
1717     touch(src);
1718     x = dup_lo(src);
1719     touch(src);
1720     if ( !eq(x - src, (alt - 1) / 2) ) return __LINE__;
1721 #endif
1722 
1723 #ifdef dup_hi
1724     touch(src);
1725     x = dup_hi(src);
1726     touch(src);
1727     if ( !eq(x - src, (alt + 1) / 2) ) return __LINE__;
1728 #endif
1729 
1730     for ( i = 0; i < ELEM_COUNT; ++i )
1731         y[i] = (i & 1 ? inv : src)[i];
1732 
1733 #ifdef select
1734 # ifdef UINT_SIZE
1735     select(&z, src, inv, alt);
1736 # else
1737     select(&z, src, inv, alt > 0);
1738 # endif
1739     if ( !eq(z, y) ) return __LINE__;
1740 #endif
1741 
1742 #ifdef select2
1743 # ifdef UINT_SIZE
1744     select2(&z, src, inv, alt);
1745 # else
1746     select2(&z, src, inv, alt > 0);
1747 # endif
1748     if ( !eq(z, y) ) return __LINE__;
1749 #endif
1750 
1751 #ifdef mix
1752     touch(src);
1753     touch(inv);
1754     x = mix(src, inv);
1755     if ( !eq(x, y) ) return __LINE__;
1756 
1757 # ifdef addsub
1758     touch(src);
1759     touch(inv);
1760     x = addsub(src, inv);
1761     touch(src);
1762     touch(inv);
1763     y = mix(src - inv, src + inv);
1764     if ( !eq(x, y) ) return __LINE__;
1765 # endif
1766 #endif
1767 
1768 #ifdef rotr
1769     x = rotr(src, 1);
1770     y = (src & (ELEM_COUNT - 1)) + 1;
1771     if ( !eq(x, y) ) return __LINE__;
1772 #endif
1773 
1774 #ifdef dot_product
1775     touch(src);
1776     touch(inv);
1777     x = dot_product(src, inv);
1778     if ( !eq(x, (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
1779                           (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
1780 #endif
1781 
1782 #ifdef hadd
1783 # if (!defined(INT_SIZE) || INT_SIZE > 1 || ELEM_COUNT < 16) && \
1784      (!defined(UINT_SIZE) || UINT_SIZE > 1 || ELEM_COUNT <= 16)
1785     x = src;
1786     for ( i = ELEM_COUNT; i >>= 1; )
1787     {
1788         touch(x);
1789         x = hadd((vec_t){}, x);
1790     }
1791     if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
1792 # endif
1793 
1794 # ifdef hsub
1795     touch(src);
1796     touch(inv);
1797     x = hsub(src, inv);
1798     for ( i = ELEM_COUNT; i >>= 1; )
1799         x = hadd(x, (vec_t){});
1800     if ( !eq(x, (vec_t){}) ) return __LINE__;
1801 # endif
1802 #endif
1803 
1804 #if defined(getexp) && defined(getmant)
1805     touch(src);
1806     x = getmant(src);
1807     touch(src);
1808     y = getexp(src);
1809     touch(src);
1810     for ( j = i = 0; i < ELEM_COUNT; ++i )
1811     {
1812         if ( y[i] != j ) return __LINE__;
1813 
1814         if ( !((i + 1) & (i + 2)) )
1815             ++j;
1816 
1817         if ( !(i & (i + 1)) && x[i] != 1 ) return __LINE__;
1818     }
1819 # ifdef scale
1820     touch(y);
1821     z = scale(x, y);
1822     if ( !eq(src, z) ) return __LINE__;
1823 # endif
1824 #endif
1825 
1826 #if (defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)) || \
1827     (defined(__AVX512F__) && defined(FLOAT_SIZE))
1828     return -fma_test();
1829 #endif
1830 
1831     return 0;
1832 }
1833