1 #include "simd.h"
2
3 ENTRY(simd_test);
4
/*
 * Element-wise equality (eq()) and all-lanes-true (to_bool()) helpers,
 * selected by vector width and available ISA extensions.  The AVX512
 * variants compare into a mask register and check it against ALL_TRUE;
 * the pre-AVX512 variants reduce a compare-result vector via ptest /
 * movmsk style instructions.
 */
#if defined(__AVX512F__)
# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
# if VEC_SIZE == 4
#  define eq(x, y) ({ \
    float x_ = (x)[0]; \
    float __attribute__((vector_size(16))) y_ = { (y)[0] }; \
    unsigned short r_; \
    asm ( "vcmpss $0, %1, %2, %0" : "=k" (r_) : "m" (x_), "v" (y_) ); \
    r_ == 1; \
})
# elif VEC_SIZE == 8
#  define eq(x, y) ({ \
    double x_ = (x)[0]; \
    double __attribute__((vector_size(16))) y_ = { (y)[0] }; \
    unsigned short r_; \
    asm ( "vcmpsd $0, %1, %2, %0" : "=k" (r_) : "m" (x_), "v" (y_) ); \
    r_ == 1; \
})
# elif FLOAT_SIZE == 4
/*
 * gcc's (up to at least 8.2) __builtin_ia32_cmpps256_mask() has an anomaly in
 * that its return type is QI rather than UQI, and hence the value would get
 * sign-extended before comparing to ALL_TRUE. The same oddity does not matter
 * for __builtin_ia32_cmppd256_mask(), as there only 4 bits are significant.
 * Hence the extra " & ALL_TRUE".
 */
#  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
# elif FLOAT_SIZE == 8
#  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
# elif (INT_SIZE == 1 || UINT_SIZE == 1) && defined(__AVX512BW__)
#  define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
# elif (INT_SIZE == 2 || UINT_SIZE == 2) && defined(__AVX512BW__)
#  define eq(x, y) (B(pcmpeqw, _mask, (vhi_t)(x), (vhi_t)(y), -1) == ALL_TRUE)
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
# endif
#elif VEC_SIZE == 8 && defined(__SSE__)
# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
#elif VEC_SIZE == 16
# if defined(__AVX__) && defined(FLOAT_SIZE)
#  if ELEM_SIZE == 4
#   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
#  elif ELEM_SIZE == 8
#   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
#  endif
# elif defined(__SSE4_1__)
#  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
# elif defined(__SSE__) && ELEM_SIZE == 4
#  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
# elif defined(__SSE2__)
#  if ELEM_SIZE == 8
#   define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) == 3)
#  else
#   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
#  endif
# endif
#elif VEC_SIZE == 32
# if defined(__AVX2__)
#  define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vdi_t){} == 0)
# elif defined(__AVX__) && ELEM_SIZE == 4
#  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
# elif defined(__AVX__) && ELEM_SIZE == 8
#  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
# endif
#endif
72
73 #ifndef to_bool
_to_bool(byte_vec_t bv)74 static inline bool _to_bool(byte_vec_t bv)
75 {
76 unsigned int i;
77
78 for ( i = 0; i < VEC_SIZE; ++i )
79 if ( bv[i] != 0xff )
80 return false;
81
82 return true;
83 }
84 # define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
85 #endif
86
/* Generic eq() fallback: vector compare, then reduce with to_bool(). */
#ifndef eq
# define eq(x, y) to_bool((x) == (y))
#endif
90
/*
 * FP <-> integer round-trip helpers: to_int()/to_uint() convert through
 * same-width integers, to_wint()/to_uwint() through 64-bit integers.
 * Each variant converts to the integer type and straight back, so the
 * result is the input value rounded to an integral value.
 */
#if VEC_SIZE == FLOAT_SIZE
# define to_int(x) ({ int i_ = (x)[0]; touch(i_); ((vec_t){ i_ }); })
# ifdef __x86_64__
#  define to_wint(x) ({ long l_ = (x)[0]; touch(l_); ((vec_t){ l_ }); })
# endif
# ifdef __AVX512F__
/*
 * Sadly even gcc 9.x, at the time of writing, does not carry out at least
 * uint -> FP conversions using VCVTUSI2S{S,D}, so we need to use builtins
 * or inline assembly here. The full-vector parameter types of the builtins
 * aren't very helpful for our purposes, so use inline assembly.
 */
#  if FLOAT_SIZE == 4
#   define to_u_int(type, x) ({ \
    unsigned type u_; \
    float __attribute__((vector_size(16))) t_; \
    asm ( "vcvtss2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
    asm ( "vcvtusi2ss%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
    (vec_t){ t_[0] }; \
})
#  elif FLOAT_SIZE == 8
#   define to_u_int(type, x) ({ \
    unsigned type u_; \
    double __attribute__((vector_size(16))) t_; \
    asm ( "vcvtsd2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
    asm ( "vcvtusi2sd%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
    (vec_t){ t_[0] }; \
})
#  endif
#  define to_uint(x) to_u_int(int, x)
#  ifdef __x86_64__
#   define to_uwint(x) to_u_int(long, x)
#  endif
# endif
#elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
# define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
#elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \
      (VEC_SIZE == 64 || defined(__AVX512VL__))
# if FLOAT_SIZE == 4
#  define to_int(x) BR(cvtdq2ps, _mask, BR(cvtps2dq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
#  define to_uint(x) BR(cvtudq2ps, _mask, BR(cvtps2udq, _mask, x, (vsi_t)undef(), ~0), undef(), ~0)
#  ifdef __AVX512DQ__
/* 64-bit conversions only cover half a vector, so do it in two halves. */
#   define to_w_int(x, s) ({ \
    vsf_half_t t_ = low_half(x); \
    vdi_t lo_, hi_; \
    touch(t_); \
    lo_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \
    t_ = high_half(x); \
    touch(t_); \
    hi_ = BR(cvtps2 ## s ## qq, _mask, t_, (vdi_t)undef(), ~0); \
    touch(lo_); touch(hi_); \
    insert_half(insert_half(undef(), \
                            BR(cvt ## s ## qq2ps, _mask, lo_, (vsf_half_t){}, ~0), 0), \
                BR(cvt ## s ## qq2ps, _mask, hi_, (vsf_half_t){}, ~0), 1); \
})
#   define to_wint(x) to_w_int(x, )
#   define to_uwint(x) to_w_int(x, u)
#  endif
# elif FLOAT_SIZE == 8
#  define to_int(x) B(cvtdq2pd, _mask, BR(cvtpd2dq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
#  define to_uint(x) B(cvtudq2pd, _mask, BR(cvtpd2udq, _mask, x, (vsi_half_t){}, ~0), undef(), ~0)
#  ifdef __AVX512DQ__
#   define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
#   define to_uwint(x) BR(cvtuqq2pd, _mask, BR(cvtpd2uqq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
#  endif
# endif
#elif VEC_SIZE == 16 && defined(__SSE2__)
# if FLOAT_SIZE == 4
#  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
# elif FLOAT_SIZE == 8
#  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
# endif
#elif VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
# elif FLOAT_SIZE == 8
#  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
# endif
#endif
170
/*
 * Wrappers to run a scalar (single-element) insn, given as an asm()
 * template string, on element 0 of a vector and return the result as a
 * single-element vec_t.  %[in]/%[in1]/%[in2] and %[out] are the operand
 * names the template is expected to use.
 */
#if VEC_SIZE == FLOAT_SIZE
# define scalar_1op(x, op) ({ \
    typeof((x)[0]) __attribute__((vector_size(16))) r_; \
    asm ( op : [out] "=&x" (r_) : [in] "m" (x) ); \
    (vec_t){ r_[0] }; \
})
# define scalar_2op(x, y, op) ({ \
    typeof((x)[0]) __attribute__((vector_size(16))) r_ = { x[0] }; \
    asm ( op : [out] "=&x" (r_) : [in1] "[out]" (r_), [in2] "m" (y) ); \
    (vec_t){ r_[0] }; \
})
#endif
183
/* Half-vector access and pair insertion for 4 x float SSE vectors. */
#if VEC_SIZE == 16 && FLOAT_SIZE == 4 && defined(__SSE__)
# define low_half(x) (x)
# define high_half(x) B_(movhlps, , undef(), x)
/*
 * GCC 7 (and perhaps earlier) report a bogus type mismatch for the conditional
 * expression below. All works well with this no-op wrapper.
 */
static inline vec_t movlhps(vec_t x, vec_t y) {
    return __builtin_ia32_movlhps(x, y);
}
# define insert_pair(x, y, p) \
    ((p) ? movlhps(x, y) \
         : ({ vec_t t_ = (x); t_[0] = (y)[0]; t_[1] = (y)[1]; t_; }))
#endif
198
199 #if VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW_A__)
200 # define max __builtin_ia32_pfmax
201 # define min __builtin_ia32_pfmin
202 # define recip(x) ({ \
203 vec_t t_ = __builtin_ia32_pfrcp(x); \
204 touch(x); \
205 t_[1] = __builtin_ia32_pfrcp(__builtin_ia32_pswapdsf(x))[0]; \
206 touch(x); \
207 __builtin_ia32_pfrcpit2(__builtin_ia32_pfrcpit1(t_, x), t_); \
208 })
209 # define rsqrt(x) ({ \
210 vec_t t_ = __builtin_ia32_pfrsqrt(x); \
211 touch(x); \
212 t_[1] = __builtin_ia32_pfrsqrt(__builtin_ia32_pswapdsf(x))[0]; \
213 touch(x); \
214 __builtin_ia32_pfrcpit2(__builtin_ia32_pfrsqit1(__builtin_ia32_pfmul(t_, t_), x), t_); \
215 })
216 #elif defined(FLOAT_SIZE) && VEC_SIZE == FLOAT_SIZE && defined(__AVX512F__)
217 # if FLOAT_SIZE == 4
218 # define getexp(x) scalar_1op(x, "vgetexpss %[in], %[out], %[out]")
219 # define getmant(x) scalar_1op(x, "vgetmantss $0, %[in], %[out], %[out]")
220 # ifdef __AVX512ER__
221 # define recip(x) scalar_1op(x, "vrcp28ss %[in], %[out], %[out]")
222 # define rsqrt(x) scalar_1op(x, "vrsqrt28ss %[in], %[out], %[out]")
223 # else
224 # define recip(x) scalar_1op(x, "vrcp14ss %[in], %[out], %[out]")
225 # define rsqrt(x) scalar_1op(x, "vrsqrt14ss %[in], %[out], %[out]")
226 # endif
227 # define scale(x, y) scalar_2op(x, y, "vscalefss %[in2], %[in1], %[out]")
228 # define sqrt(x) scalar_1op(x, "vsqrtss %[in], %[out], %[out]")
229 # define trunc(x) scalar_1op(x, "vrndscaless $0b1011, %[in], %[out], %[out]")
230 # elif FLOAT_SIZE == 8
231 # define getexp(x) scalar_1op(x, "vgetexpsd %[in], %[out], %[out]")
232 # define getmant(x) scalar_1op(x, "vgetmantsd $0, %[in], %[out], %[out]")
233 # ifdef __AVX512ER__
234 # define recip(x) scalar_1op(x, "vrcp28sd %[in], %[out], %[out]")
235 # define rsqrt(x) scalar_1op(x, "vrsqrt28sd %[in], %[out], %[out]")
236 # else
237 # define recip(x) scalar_1op(x, "vrcp14sd %[in], %[out], %[out]")
238 # define rsqrt(x) scalar_1op(x, "vrsqrt14sd %[in], %[out], %[out]")
239 # endif
240 # define scale(x, y) scalar_2op(x, y, "vscalefsd %[in2], %[in1], %[out]")
241 # define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
242 # define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]")
243 # endif
244 #elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
245 (VEC_SIZE == 64 || defined(__AVX512VL__))
246 # if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
247 (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
248 (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
249 # define _half(x, lh) ({ \
250 half_t t_; \
251 asm ( "vextractf%c[w]x%c[n] %[sel], %[s], %[d]" \
252 : [d] "=m" (t_) \
253 : [s] "v" (x), [sel] "i" (lh), \
254 [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
255 t_; \
256 })
257 # define low_half(x) _half(x, 0)
258 # define high_half(x) _half(x, 1)
259 # endif
260 # if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
261 (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
262 # define low_quarter(x) ({ \
263 quarter_t t_; \
264 asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
265 : [d] "=m" (t_) \
266 : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
267 t_; \
268 })
269 # endif
270 # if FLOAT_SIZE == 4
271 # define broadcast(x) ({ \
272 vec_t t_; \
273 asm ( "%{evex%} vbroadcastss %1, %0" \
274 : "=v" (t_) : "m" (*(float[1]){ x }) ); \
275 t_; \
276 })
277 # if VEC_SIZE >= 32 && defined(__AVX512DQ__)
278 # define broadcast_pair(x) ({ \
279 vec_t t_; \
280 asm ( "vbroadcastf32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
281 t_; \
282 })
283 # endif
284 # if VEC_SIZE == 64 && defined(__AVX512DQ__)
285 # define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0)
286 # define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0)
287 # endif
288 # ifdef __AVX512DQ__
289 # define frac(x) B(reduceps, _mask, x, 0b00001011, undef(), ~0)
290 # endif
291 # define getexp(x) BR(getexpps, _mask, x, undef(), ~0)
292 # define getmant(x) BR(getmantps, _mask, x, 0, undef(), ~0)
293 # ifdef __AVX512DQ__
294 # define max(x, y) BR(rangeps, _mask, x, y, 0b0101, undef(), ~0)
295 # define min(x, y) BR(rangeps, _mask, x, y, 0b0100, undef(), ~0)
296 # else
297 # define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
298 # define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
299 # endif
300 # define mix(x, y) B(blendmps_, _mask, x, y, (0b1010101010101010 & ALL_TRUE))
301 # define scale(x, y) BR(scalefps, _mask, x, y, undef(), ~0)
302 # if VEC_SIZE == 64 && defined(__AVX512ER__)
303 # define recip(x) BR(rcp28ps, _mask, x, undef(), ~0)
304 # define rsqrt(x) BR(rsqrt28ps, _mask, x, undef(), ~0)
305 # else
306 # define recip(x) B(rcp14ps, _mask, x, undef(), ~0)
307 # define rsqrt(x) B(rsqrt14ps, _mask, x, undef(), ~0)
308 # endif
309 # define shrink1(x) BR_(cvtpd2ps, _mask, (vdf_t)(x), (vsf_half_t){}, ~0)
310 # define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0)
311 # define trunc(x) BR(rndscaleps_, _mask, x, 0b1011, undef(), ~0)
312 # define widen1(x) ((vec_t)BR(cvtps2pd, _mask, x, (vdf_t)undef(), ~0))
313 # if VEC_SIZE == 16
314 # define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
315 # define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
316 # define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
317 # define swap2(x) B_(vpermilps, _mask, x, 0b00011011, undef(), ~0)
318 # else
319 # define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
320 # define insert_pair(x, y, p) \
321 B(insertf32x4_, _mask, x, \
322 /* Cast needed below to work around gcc 7.x quirk. */ \
323 (p) & 1 ? (typeof(y))__builtin_ia32_shufps(y, y, 0b01000100) : (y), \
324 (p) >> 1, x, 3 << ((p) * 2))
325 # define insert_quartet(x, y, p) B(insertf32x4_, _mask, x, y, p, undef(), ~0)
326 # define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
327 # define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
328 # define swap(x) ({ \
329 vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
330 B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
331 })
332 # define swap2(x) B(vpermilps, _mask, \
333 B(shuf_f32x4_, _mask, x, x, \
334 VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
335 0b00011011, undef(), ~0)
336 # endif
337 # elif FLOAT_SIZE == 8
338 # if VEC_SIZE >= 32
339 # define broadcast(x) ({ \
340 vec_t t_; \
341 asm ( "%{evex%} vbroadcastsd %1, %0" : "=v" (t_) \
342 : "m" (*(double[1]){ x }) ); \
343 t_; \
344 })
345 # else
346 # define broadcast(x) ({ \
347 vec_t t_; \
348 asm ( "%{evex%} vpbroadcastq %1, %0" \
349 : "=v" (t_) : "m" (*(double[1]){ x }) ); \
350 t_; \
351 })
352 # endif
353 # if VEC_SIZE >= 32 && defined(__AVX512DQ__)
354 # define broadcast_pair(x) B(broadcastf64x2_, _mask, x, undef(), ~0)
355 # define insert_pair(x, y, p) B(insertf64x2_, _mask, x, y, p, undef(), ~0)
356 # endif
357 # if VEC_SIZE == 64
358 # define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0)
359 # define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0)
360 # endif
361 # ifdef __AVX512DQ__
362 # define frac(x) B(reducepd, _mask, x, 0b00001011, undef(), ~0)
363 # endif
364 # define getexp(x) BR(getexppd, _mask, x, undef(), ~0)
365 # define getmant(x) BR(getmantpd, _mask, x, 0, undef(), ~0)
366 # ifdef __AVX512DQ__
367 # define max(x, y) BR(rangepd, _mask, x, y, 0b0101, undef(), ~0)
368 # define min(x, y) BR(rangepd, _mask, x, y, 0b0100, undef(), ~0)
369 # else
370 # define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
371 # define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
372 # endif
373 # define mix(x, y) B(blendmpd_, _mask, x, y, 0b10101010)
374 # define scale(x, y) BR(scalefpd, _mask, x, y, undef(), ~0)
375 # if VEC_SIZE == 64 && defined(__AVX512ER__)
376 # define recip(x) BR(rcp28pd, _mask, x, undef(), ~0)
377 # define rsqrt(x) BR(rsqrt28pd, _mask, x, undef(), ~0)
378 # else
379 # define recip(x) B(rcp14pd, _mask, x, undef(), ~0)
380 # define rsqrt(x) B(rsqrt14pd, _mask, x, undef(), ~0)
381 # endif
382 # define sqrt(x) BR(sqrtpd, _mask, x, undef(), ~0)
383 # define trunc(x) BR(rndscalepd_, _mask, x, 0b1011, undef(), ~0)
384 # if VEC_SIZE == 16
385 # define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
386 # define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
387 # define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
388 # define swap2(x) B_(vpermilpd, _mask, x, 0b01, undef(), ~0)
389 # else
390 # define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
391 # define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
392 # define swap(x) ({ \
393 vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
394 B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
395 })
396 # define swap2(x) B(vpermilpd, _mask, \
397 B(shuf_f64x2_, _mask, x, x, \
398 VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
399 0b01010101, undef(), ~0)
400 # endif
401 # endif
402 #elif FLOAT_SIZE == 4 && defined(__SSE__)
403 # if VEC_SIZE == 32 && defined(__AVX__)
404 # if defined(__AVX2__)
405 # define broadcast(x) \
406 __builtin_ia32_vbroadcastss_ps256((float __attribute__((vector_size(16)))){ x })
407 # else
408 # define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
409 # endif
410 # define max(x, y) __builtin_ia32_maxps256(x, y)
411 # define min(x, y) __builtin_ia32_minps256(x, y)
412 # define recip(x) __builtin_ia32_rcpps256(x)
413 # define rsqrt(x) __builtin_ia32_rsqrtps256(x)
414 # define sqrt(x) __builtin_ia32_sqrtps256(x)
415 # define swap(x) ({ \
416 vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
417 __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
418 })
419 # ifdef __AVX2__
420 # define swap2(x) __builtin_ia32_permvarsf256(x, __builtin_ia32_cvtps2dq256(inv) - 1)
421 # else
422 # define swap2(x) ({ \
423 vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
424 __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
425 })
426 # endif
427 # elif VEC_SIZE == 16
428 # if defined(__AVX2__)
429 # define broadcast(x) __builtin_ia32_vbroadcastss_ps((vec_t){ x })
430 # elif defined(__AVX__)
431 # define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
432 # endif
433 # define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
434 # define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
435 # define max(x, y) __builtin_ia32_maxps(x, y)
436 # define min(x, y) __builtin_ia32_minps(x, y)
437 # define recip(x) __builtin_ia32_rcpps(x)
438 # define rsqrt(x) __builtin_ia32_rsqrtps(x)
439 # define sqrt(x) __builtin_ia32_sqrtps(x)
440 # define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
441 # ifdef __AVX__
442 # define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
443 # endif
444 # elif VEC_SIZE == 4
445 # define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
446 # define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
447 # define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
448 # endif
449 #elif FLOAT_SIZE == 8 && defined(__SSE2__)
450 # if VEC_SIZE == 32 && defined(__AVX__)
451 # if defined(__AVX2__)
452 # define broadcast(x) \
453 __builtin_ia32_vbroadcastsd_pd256((double __attribute__((vector_size(16)))){ x })
454 # else
455 # define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
456 # endif
457 # define max(x, y) __builtin_ia32_maxpd256(x, y)
458 # define min(x, y) __builtin_ia32_minpd256(x, y)
459 # define recip(x) ({ \
460 float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
461 t_ = __builtin_ia32_vextractf128_ps256( \
462 __builtin_ia32_rcpps256( \
463 __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
464 __builtin_ia32_cvtps2pd256(t_); \
465 })
466 # define rsqrt(x) ({ \
467 float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
468 float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
469 t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
470 t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
471 __builtin_ia32_cvtps2pd256(t1_); \
472 })
473 # define sqrt(x) __builtin_ia32_sqrtpd256(x)
474 # define swap(x) ({ \
475 vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
476 __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
477 })
478 # ifdef __AVX2__
479 # define swap2(x) __builtin_ia32_permdf256(x, 0b00011011)
480 # endif
481 # elif VEC_SIZE == 16
482 # define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
483 # define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
484 # define max(x, y) __builtin_ia32_maxpd(x, y)
485 # define min(x, y) __builtin_ia32_minpd(x, y)
486 # define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2ps(x)))
487 # define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
488 # define sqrt(x) __builtin_ia32_sqrtpd(x)
489 # define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
490 # ifdef __AVX__
491 # define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
492 __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
493 # endif
494 # elif VEC_SIZE == 8
495 # define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
496 # define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
497 # define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
498 # endif
499 #endif
500 #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
501 defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
502 # if ELEM_COUNT == 8 /* vextracti{32,64}x4 */ || \
503 (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextracti32x8 */ || \
504 (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
505 # define low_half(x) ({ \
506 half_t t_; \
507 asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
508 : [d] "=m" (t_) \
509 : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
510 t_; \
511 })
512 # endif
513 # if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextracti32x4 */ || \
514 (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
515 # define low_quarter(x) ({ \
516 quarter_t t_; \
517 asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
518 : [d] "=m" (t_) \
519 : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
520 t_; \
521 })
522 # endif
523 # if INT_SIZE == 4 || UINT_SIZE == 4
524 # define broadcast(x) ({ \
525 vec_t t_; \
526 asm ( "%{evex%} vpbroadcastd %1, %0" \
527 : "=v" (t_) : "m" (*(int[1]){ x }) ); \
528 t_; \
529 })
530 # define broadcast2(x) ({ \
531 vec_t t_; \
532 asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
533 t_; \
534 })
535 # ifdef __AVX512DQ__
536 # define broadcast_pair(x) ({ \
537 vec_t t_; \
538 asm ( "vbroadcasti32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
539 t_; \
540 })
541 # endif
542 # if VEC_SIZE == 64 && defined(__AVX512DQ__)
543 # define broadcast_octet(x) ((vec_t)B(broadcasti32x8_, _mask, (vsi_octet_t)(x), (vsi_t)undef(), ~0))
544 # define insert_octet(x, y, p) ((vec_t)B(inserti32x8_, _mask, (vsi_t)(x), (vsi_octet_t)(y), p, (vsi_t)undef(), ~0))
545 # endif
546 # if VEC_SIZE == 16
547 # define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
548 # define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
549 # define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
550 # else
551 # define broadcast_quartet(x) ((vec_t)B(broadcasti32x4_, _mask, (vsi_quartet_t)(x), (vsi_t)undef(), ~0))
552 # define insert_pair(x, y, p) \
553 (vec_t)(B(inserti32x4_, _mask, (vsi_t)(x), \
554 /* First cast needed below to work around gcc 7.x quirk. */ \
555 (p) & 1 ? (vsi_pair_t)__builtin_ia32_pshufd((vsi_pair_t)(y), 0b01000100) \
556 : (vsi_pair_t)(y), \
557 (p) >> 1, (vsi_t)(x), 3 << ((p) * 2)))
558 # define insert_quartet(x, y, p) ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_quartet_t)(y), p, (vsi_t)undef(), ~0))
559 # define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
560 # define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
561 # define swap(x) ((vec_t)B(pshufd, _mask, \
562 B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
563 VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
564 0b00011011, (vsi_t)undef(), ~0))
565 # define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0))
566 # endif
567 # define mix(x, y) ((vec_t)B(blendmd_, _mask, (vsi_t)(x), (vsi_t)(y), \
568 (0b1010101010101010 & ((1 << ELEM_COUNT) - 1))))
569 # define rotr(x, n) ((vec_t)B(alignd, _mask, (vsi_t)(x), (vsi_t)(x), n, (vsi_t)undef(), ~0))
570 # define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
571 # elif INT_SIZE == 8 || UINT_SIZE == 8
572 # define broadcast(x) ({ \
573 vec_t t_; \
574 asm ( "%{evex%} vpbroadcastq %1, %0" \
575 : "=v" (t_) : "m" (*(long long[1]){ x }) ); \
576 t_; \
577 })
578 # ifdef __x86_64__
579 # define broadcast2(x) ({ \
580 vec_t t_; \
581 asm ( "vpbroadcastq %1, %0" : "=v" (t_) : "r" ((x) + 0ULL) ); \
582 t_; \
583 })
584 # endif
585 # if VEC_SIZE >= 32 && defined(__AVX512DQ__)
586 # define broadcast_pair(x) ((vec_t)B(broadcasti64x2_, _mask, (vdi_pair_t)(x), (vdi_t)undef(), ~0))
587 # define insert_pair(x, y, p) ((vec_t)B(inserti64x2_, _mask, (vdi_t)(x), (vdi_pair_t)(y), p, (vdi_t)undef(), ~0))
588 # endif
589 # if VEC_SIZE == 64
590 # define broadcast_quartet(x) ((vec_t)B(broadcasti64x4_, , (vdi_quartet_t)(x), (vdi_t)undef(), ~0))
591 # define insert_quartet(x, y, p) ((vec_t)B(inserti64x4_, _mask, (vdi_t)(x), (vdi_quartet_t)(y), p, (vdi_t)undef(), ~0))
592 # endif
593 # if VEC_SIZE == 16
594 # define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
595 # define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
596 # define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b01001110, (vsi_t)undef(), ~0))
597 # else
598 # define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
599 # define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
600 # define swap(x) ((vec_t)B(pshufd, _mask, \
601 (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
602 VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
603 0b01001110, (vsi_t)undef(), ~0))
604 # define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0))
605 # endif
606 # define mix(x, y) ((vec_t)B(blendmq_, _mask, (vdi_t)(x), (vdi_t)(y), 0b10101010))
607 # define rotr(x, n) ((vec_t)B(alignq, _mask, (vdi_t)(x), (vdi_t)(x), n, (vdi_t)undef(), ~0))
608 # if VEC_SIZE == 32
609 # define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0))
610 # elif VEC_SIZE == 64
611 # define swap3(x) ({ \
612 vdi_t t_ = B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0); \
613 B(shuf_i64x2_, _mask, t_, t_, 0b01001110, (vdi_t)undef(), ~0); \
614 })
615 # endif
616 # endif
617 # if INT_SIZE == 4
618 # define abs(x) B(pabsd, _mask, x, undef(), ~0)
619 # define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
620 # define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
621 # define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
622 # define widen1(x) ((vec_t)B(pmovsxdq, _mask, x, (vdi_t)undef(), ~0))
623 # elif UINT_SIZE == 4
624 # define max(x, y) ((vec_t)B(pmaxud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
625 # define min(x, y) ((vec_t)B(pminud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
626 # define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
627 # define widen1(x) ((vec_t)B(pmovzxdq, _mask, (vsi_half_t)(x), (vdi_t)undef(), ~0))
628 # elif INT_SIZE == 8
629 # define abs(x) ((vec_t)B(pabsq, _mask, (vdi_t)(x), (vdi_t)undef(), ~0))
630 # define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
631 # define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
632 # elif UINT_SIZE == 8
633 # define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
634 # define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
635 # endif
636 #elif (INT_SIZE == 1 || UINT_SIZE == 1 || INT_SIZE == 2 || UINT_SIZE == 2) && \
637 defined(__AVX512BW__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
638 # if INT_SIZE == 1 || UINT_SIZE == 1
639 # define broadcast(x) ({ \
640 vec_t t_; \
641 asm ( "%{evex%} vpbroadcastb %1, %0" \
642 : "=v" (t_) : "m" (*(char[1]){ x }) ); \
643 t_; \
644 })
645 # define broadcast2(x) ({ \
646 vec_t t_; \
647 asm ( "vpbroadcastb %k1, %0" : "=v" (t_) : "r" (x) ); \
648 t_; \
649 })
650 # if VEC_SIZE == 16
651 # define interleave_hi(x, y) ((vec_t)B(punpckhbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
652 # define interleave_lo(x, y) ((vec_t)B(punpcklbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
653 # define rotr(x, n) ((vec_t)B(palignr, _mask, (vdi_t)(x), (vdi_t)(x), (n) * 8, (vdi_t)undef(), ~0))
654 # define swap(x) ((vec_t)B(pshufb, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
655 # elif defined(__AVX512VBMI__)
656 # define interleave_hi(x, y) ((vec_t)B(vpermi2varqi, _mask, (vqi_t)(x), interleave_hi, (vqi_t)(y), ~0))
657 # define interleave_lo(x, y) ((vec_t)B(vpermt2varqi, _mask, interleave_lo, (vqi_t)(x), (vqi_t)(y), ~0))
658 # endif
659 # define mix(x, y) ((vec_t)B(blendmb_, _mask, (vqi_t)(x), (vqi_t)(y), \
660 (0b1010101010101010101010101010101010101010101010101010101010101010LL & ALL_TRUE)))
661 # define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
662 # define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
663 # define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
664 # ifdef __AVX512VBMI__
665 # define swap2(x) ((vec_t)B(permvarqi, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
666 # endif
667 # elif INT_SIZE == 2 || UINT_SIZE == 2
668 # define broadcast(x) ({ \
669 vec_t t_; \
670 asm ( "%{evex%} vpbroadcastw %1, %0" \
671 : "=v" (t_) : "m" (*(short[1]){ x }) ); \
672 t_; \
673 })
674 # define broadcast2(x) ({ \
675 vec_t t_; \
676 asm ( "vpbroadcastw %k1, %0" : "=v" (t_) : "r" (x) ); \
677 t_; \
678 })
679 # if VEC_SIZE == 16
680 # define interleave_hi(x, y) ((vec_t)B(punpckhwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
681 # define interleave_lo(x, y) ((vec_t)B(punpcklwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
682 # define rotr(x, n) ((vec_t)B(palignr, _mask, (vdi_t)(x), (vdi_t)(x), (n) * 16, (vdi_t)undef(), ~0))
683 # define swap(x) ((vec_t)B(pshufd, _mask, \
684 (vsi_t)B(pshufhw, _mask, \
685 B(pshuflw, _mask, (vhi_t)(x), 0b00011011, (vhi_t)undef(), ~0), \
686 0b00011011, (vhi_t)undef(), ~0), \
687 0b01001110, (vsi_t)undef(), ~0))
688 # else
689 # define interleave_hi(x, y) ((vec_t)B(vpermi2varhi, _mask, (vhi_t)(x), interleave_hi, (vhi_t)(y), ~0))
690 # define interleave_lo(x, y) ((vec_t)B(vpermt2varhi, _mask, interleave_lo, (vhi_t)(x), (vhi_t)(y), ~0))
691 # endif
692 # define mix(x, y) ((vec_t)B(blendmw_, _mask, (vhi_t)(x), (vhi_t)(y), \
693 (0b10101010101010101010101010101010 & ALL_TRUE)))
694 # define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
695 # define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
696 # define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0))
697 # endif
698 # if INT_SIZE == 1
699 # define abs(x) ((vec_t)B(pabsb, _mask, (vqi_t)(x), (vqi_t)undef(), ~0))
700 # define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
701 # define min(x, y) ((vec_t)B(pminsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
702 # define widen1(x) ((vec_t)B(pmovsxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
703 # define widen2(x) ((vec_t)B(pmovsxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
704 # define widen3(x) ((vec_t)B(pmovsxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
705 # elif UINT_SIZE == 1
706 # define max(x, y) ((vec_t)B(pmaxub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
707 # define min(x, y) ((vec_t)B(pminub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
708 # define widen1(x) ((vec_t)B(pmovzxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
709 # define widen2(x) ((vec_t)B(pmovzxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
710 # define widen3(x) ((vec_t)B(pmovzxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
711 # elif INT_SIZE == 2
712 # define abs(x) B(pabsw, _mask, x, undef(), ~0)
713 # define max(x, y) B(pmaxsw, _mask, x, y, undef(), ~0)
714 # define min(x, y) B(pminsw, _mask, x, y, undef(), ~0)
715 # define mul_hi(x, y) B(pmulhw, _mask, x, y, undef(), ~0)
716 # define widen1(x) ((vec_t)B(pmovsxwd, _mask, x, (vsi_t)undef(), ~0))
717 # define widen2(x) ((vec_t)B(pmovsxwq, _mask, x, (vdi_t)undef(), ~0))
718 # elif UINT_SIZE == 2
719 # define max(x, y) ((vec_t)B(pmaxuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
720 # define min(x, y) ((vec_t)B(pminuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
721 # define mul_hi(x, y) ((vec_t)B(pmulhuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
722 # define widen1(x) ((vec_t)B(pmovzxwd, _mask, (vhi_half_t)(x), (vsi_t)undef(), ~0))
723 # define widen2(x) ((vec_t)B(pmovzxwq, _mask, (vhi_quarter_t)(x), (vdi_t)undef(), ~0))
724 # endif
725 #elif VEC_SIZE == 16 && defined(__SSE2__)
726 # if INT_SIZE == 1 || UINT_SIZE == 1
727 # define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
728 # define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
729 # elif INT_SIZE == 2 || UINT_SIZE == 2
730 # define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)(x), (vhi_t)(y)))
731 # define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)(x), (vhi_t)(y)))
732 # define swap(x) ((vec_t)__builtin_ia32_pshufd( \
733 (vsi_t)__builtin_ia32_pshufhw( \
734 __builtin_ia32_pshuflw((vhi_t)(x), 0b00011011), 0b00011011), 0b01001110))
735 # elif INT_SIZE == 4 || UINT_SIZE == 4
736 # define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)(x), (vsi_t)(y)))
737 # define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)(x), (vsi_t)(y)))
738 # define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b00011011))
739 # elif INT_SIZE == 8 || UINT_SIZE == 8
740 # define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)(x), (vdi_t)(y)))
741 # define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)(x), (vdi_t)(y)))
742 # define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b01001110))
743 # endif
744 # if UINT_SIZE == 1
745 # define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)(x), (vqi_t)(y)))
746 # define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)(x), (vqi_t)(y)))
747 # elif INT_SIZE == 2
748 # define max(x, y) __builtin_ia32_pmaxsw128(x, y)
749 # define min(x, y) __builtin_ia32_pminsw128(x, y)
750 # define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)
751 # elif UINT_SIZE == 2
752 # define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)(x), (vhi_t)(y)))
753 # elif UINT_SIZE == 4
754 # define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)(x), (vsi_t)(y)))
755 # endif
756 # define select(d, x, y, m) ({ \
757 void *d_ = (d); \
758 vqi_t m_ = (vqi_t)(m); \
759 __builtin_ia32_maskmovdqu((vqi_t)(x), m_, d_); \
760 __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
761 })
762 #elif VEC_SIZE == 32 && defined(__AVX2__)
763 # define swap_lanes(x, y, func, type) ({ \
764 long long __attribute__((vector_size(16))) t_ = __builtin_ia32_extract128i256((vdi_t)(y), 0); \
765 type t1_ = (type)__builtin_ia32_insert128i256((vdi_t)(x), t_, 1), t2_; \
766 t_ = __builtin_ia32_extract128i256((vdi_t)(x), 1); \
767 t2_ = (type)__builtin_ia32_insert128i256((vdi_t)(y), t_, 0); \
768 func(t1_, t2_); \
769 })
770 # if INT_SIZE == 1 || UINT_SIZE == 1
771 # define broadcast(x) ({ char s_ = (x); vec_t d_; asm ( "vpbroadcastb %1,%0" : "=x" (d_) : "m" (s_)); d_; })
772 # define copysignz(x, y) ((vec_t)__builtin_ia32_psignb256((vqi_t)(x), (vqi_t)(y)))
773 # define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
774 (vdi_t)(x), (n) * 8))
775 # elif INT_SIZE == 2 || UINT_SIZE == 2
776 # define broadcast(x) ({ short s_ = (x); vec_t d_; asm ( "vpbroadcastw %1,%0" : "=x" (d_) : "m" (s_)); d_; })
777 # define copysignz(x, y) ((vec_t)__builtin_ia32_psignw256((vhi_t)(x), (vhi_t)(y)))
778 # define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddw256, vhi_t))
779 # define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubw256, vhi_t))
780 # define mix(x, y) ((vec_t)__builtin_ia32_pblendw256((vhi_t)(x), (vhi_t)(y), 0b10101010))
781 # define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
782 (vdi_t)(x), (n) * 16))
783 # elif INT_SIZE == 4 || UINT_SIZE == 4
784 # define broadcast(x) ({ int s_ = (x); vec_t d_; asm ( "vpbroadcastd %1,%0" : "=x" (d_) : "m" (s_)); d_; })
785 # define copysignz(x, y) ((vec_t)__builtin_ia32_psignd256((vsi_t)(x), (vsi_t)(y)))
786 # define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddd256, vsi_t))
787 # define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubd256, vsi_t))
788 # define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b10101010))
789 # define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
790 (vdi_t)(x), (n) * 32))
791 # define select(d, x, y, m) ({ \
792 vsi_t m_ = (vsi_t)(m); \
793 *(d) = (vec_t)__builtin_ia32_maskloadd256((vsi_t *)&(x), m_); \
794 __builtin_ia32_maskstored256((vsi_t *)(d), ~m_, (vsi_t)(y)); \
795 })
796 # define swap(x) ((vec_t)__builtin_ia32_permvarsi256((vsi_t)(x), (vsi_t)inv - 1))
797 # elif INT_SIZE == 8 || UINT_SIZE == 8
798 # define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b11001100))
799 # define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
800 (vdi_t)(x), (n) * 64))
801 # define select(d, x, y, m) ({ \
802 vdi_t m_ = (vdi_t)(m); \
803 *(d) = (vec_t)__builtin_ia32_maskloadq256((vdi_t *)&(x), m_); \
804 __builtin_ia32_maskstoreq256((vdi_t *)(d), ~m_, (vdi_t)(y)); \
805 })
806 # define swap(x) ((vec_t)__builtin_ia32_permdi256((vdi_t)(x), 0b00011011))
807 # define swap2(x) ({ \
808 vdi_t t_ = __builtin_ia32_permdi256((vdi_t)(x), 0b10110001); \
809 (vec_t)__builtin_ia32_permti256(t_, t_, 0b00000001); \
810 })
811 # endif
812 # if INT_SIZE == 1
813 # define abs(x) ((vec_t)__builtin_ia32_pabsb256((vqi_t)(x)))
814 # define max(x, y) ((vec_t)__builtin_ia32_pmaxsb256((vqi_t)(x), (vqi_t)(y)))
815 # define min(x, y) ((vec_t)__builtin_ia32_pminsb256((vqi_t)(x), (vqi_t)(y)))
816 # define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw256((vqi_t)(x)))
817 # define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd256((vqi_t)(x)))
818 # define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq256((vqi_t)(x)))
819 # elif UINT_SIZE == 1
820 # define max(x, y) ((vec_t)__builtin_ia32_pmaxub256((vqi_t)(x), (vqi_t)(y)))
821 # define min(x, y) ((vec_t)__builtin_ia32_pminub256((vqi_t)(x), (vqi_t)(y)))
822 # define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw256((vqi_t)(x)))
823 # define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd256((vqi_t)(x)))
824 # define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq256((vqi_t)(x)))
825 # elif INT_SIZE == 2
826 # define abs(x) __builtin_ia32_pabsw256(x)
827 # define max(x, y) __builtin_ia32_pmaxsw256(x, y)
828 # define min(x, y) __builtin_ia32_pminsw256(x, y)
829 # define mul_hi(x, y) __builtin_ia32_pmulhw256(x, y)
830 # define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd256(x))
831 # define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq256(x))
832 # elif UINT_SIZE == 2
833 # define max(x, y) ((vec_t)__builtin_ia32_pmaxuw256((vhi_t)(x), (vhi_t)(y)))
834 # define min(x, y) ((vec_t)__builtin_ia32_pminuw256((vhi_t)(x), (vhi_t)(y)))
835 # define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw256((vhi_t)(x), (vhi_t)(y)))
836 # define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd256((vhi_t)(x)))
837 # define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq256((vhi_t)(x)))
838 # elif INT_SIZE == 4
839 # define abs(x) __builtin_ia32_pabsd256(x)
840 # define max(x, y) __builtin_ia32_pmaxsd256(x, y)
841 # define min(x, y) __builtin_ia32_pminsd256(x, y)
842 # define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq256(x))
843 # elif UINT_SIZE == 4
844 # define max(x, y) ((vec_t)__builtin_ia32_pmaxud256((vsi_t)(x), (vsi_t)(y)))
845 # define min(x, y) ((vec_t)__builtin_ia32_pminud256((vsi_t)(x), (vsi_t)(y)))
846 # define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq256((vsi_t)(x), (vsi_t)(y)))
847 # define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq256((vsi_t)(x)))
848 # elif INT_SIZE == 8
849 # define broadcast(x) ({ \
850 long long s_ = (x); \
851 long long __attribute__((vector_size(16))) t_; \
852 vec_t d_; \
853 asm ( "vpbroadcastq %1,%0" : "=x" (t_) : "m" (s_)); \
854 asm ( "vbroadcasti128 %1,%0" : "=x" (d_) : "m" (t_)); \
855 d_; \
856 })
857 # elif UINT_SIZE == 8
858 # define broadcast(x) ({ long long s_ = (x); vec_t d_; asm ( "vpbroadcastq %1,%0" : "=x" (d_) : "m" (s_)); d_; })
859 # endif
860 #endif
/*
 * Horizontal add/subtract, add/subtract-interleaved, and element duplication.
 * Note that the 256-bit AVX forms of haddp{s,d}/hsubp{s,d} operate on each
 * 128-bit lane independently, so their results need re-ordering into "whole
 * vector" element order: via a cross-lane permute when AVX2 is available, or
 * by element-wise re-assembly otherwise.
 */
#if VEC_SIZE == 16 && defined(__SSE3__)
# if FLOAT_SIZE == 4
#  define addsub(x, y) __builtin_ia32_addsubps(x, y)
#  define dup_hi(x) __builtin_ia32_movshdup(x)
#  define dup_lo(x) __builtin_ia32_movsldup(x)
#  define hadd(x, y) __builtin_ia32_haddps(x, y)
#  define hsub(x, y) __builtin_ia32_hsubps(x, y)
# elif FLOAT_SIZE == 8
#  define addsub(x, y) __builtin_ia32_addsubpd(x, y)
/* Uses a memory operand so that the insn's load form gets exercised. */
#  define dup_lo(x) ({ \
    double __attribute__((vector_size(16))) r_; \
    asm ( "movddup %1,%0" : "=x" (r_) : "m" ((x)[0]) ); \
    r_; \
})
#  define hadd(x, y) __builtin_ia32_haddpd(x, y)
#  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
# endif
#elif VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
#  define dup_hi(x) __builtin_ia32_movshdup256(x)
#  define dup_lo(x) __builtin_ia32_movsldup256(x)
#  ifdef __AVX2__
/* vpermps re-orders the per-lane results into linear element order. */
#   define hadd(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_haddps256(x, y), \
                                                  (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
#   define hsub(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_hsubps256(x, y), \
                                                  (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
#  else
/* Without AVX2, re-order the per-lane results element by element. */
#   define hadd(x, y) ({ \
    vec_t t_ = __builtin_ia32_haddps256(x, y); \
    (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
#   define hsub(x, y) ({ \
    vec_t t_ = __builtin_ia32_hsubps256(x, y); \
    (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
#  endif
# elif FLOAT_SIZE == 8
#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
#  define dup_lo(x) __builtin_ia32_movddup256(x)
#  ifdef __AVX2__
/* vpermpd with 0b11011000 swaps the two middle elements into linear order. */
#   define hadd(x, y) __builtin_ia32_permdf256(__builtin_ia32_haddpd256(x, y), 0b11011000)
#   define hsub(x, y) __builtin_ia32_permdf256(__builtin_ia32_hsubpd256(x, y), 0b11011000)
#  else
#   define hadd(x, y) ({ \
    vec_t t_ = __builtin_ia32_haddpd256(x, y); \
    (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
#   define hsub(x, y) ({ \
    vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
    (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
#  endif
# endif
#endif
/*
 * SSSE3 helpers (abs, sign-copy, horizontal add/sub, rotate-right).  Not used
 * when AVX512VL is available, as EVEX-encoded forms are defined elsewhere.
 * rotr() is built on palignr, with the byte count scaled by the element size.
 */
#if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
# if INT_SIZE == 1
#  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
# elif INT_SIZE == 2
#  define abs(x) __builtin_ia32_pabsw128(x)
# elif INT_SIZE == 4
#  define abs(x) __builtin_ia32_pabsd128(x)
# endif
# if INT_SIZE == 1 || UINT_SIZE == 1
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignb128((vqi_t)(x), (vqi_t)(y)))
/* pshufb with (inv - 1) as control reverses the element order. */
#  define swap(x) ((vec_t)__builtin_ia32_pshufb128((vqi_t)(x), (vqi_t)(inv - 1)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 8))
# elif INT_SIZE == 2 || UINT_SIZE == 2
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignw128((vhi_t)(x), (vhi_t)(y)))
#  define hadd(x, y) ((vec_t)__builtin_ia32_phaddw128((vhi_t)(x), (vhi_t)(y)))
#  define hsub(x, y) ((vec_t)__builtin_ia32_phsubw128((vhi_t)(x), (vhi_t)(y)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 16))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignd128((vsi_t)(x), (vsi_t)(y)))
#  define hadd(x, y) ((vec_t)__builtin_ia32_phaddd128((vsi_t)(x), (vsi_t)(y)))
#  define hsub(x, y) ((vec_t)__builtin_ia32_phsubd128((vsi_t)(x), (vsi_t)(y)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 32))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
# endif
#endif
/*
 * SSE4.1 helpers.  select() is redefined here (hence the #undef) on top of
 * the blendv insns, whose operand order is (false-value, true-value, mask).
 * The trunc() rounding immediate 0b1011 requests round-toward-zero with
 * precision exceptions suppressed.
 */
#if VEC_SIZE == 16 && defined(__SSE4_1__) && !defined(__AVX512VL__)
# if INT_SIZE == 1
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw128((vqi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd128((vqi_t)(x)))
#  define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq128((vqi_t)(x)))
# elif INT_SIZE == 2
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd128(x))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq128(x))
# elif INT_SIZE == 4
#  define max(x, y) __builtin_ia32_pmaxsd128(x, y)
#  define min(x, y) __builtin_ia32_pminsd128(x, y)
#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuldq128(x, y))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq128(x))
# elif UINT_SIZE == 1
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw128((vqi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd128((vqi_t)(x)))
#  define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq128((vqi_t)(x)))
# elif UINT_SIZE == 2
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxuw128((vhi_t)(x), (vhi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminuw128((vhi_t)(x), (vhi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd128((vhi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq128((vhi_t)(x)))
# elif UINT_SIZE == 4
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxud128((vsi_t)(x), (vsi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminud128((vsi_t)(x), (vsi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq128((vsi_t)(x)))
# endif
# undef select
# if defined(INT_SIZE) || defined(UINT_SIZE)
#  define select(d, x, y, m) \
    (*(d) = (vec_t)__builtin_ia32_pblendvb128((vqi_t)(y), (vqi_t)(x), (vqi_t)(m)))
# elif FLOAT_SIZE == 4
#  define dot_product(x, y) __builtin_ia32_dpps(x, y, 0b11110001)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps(y, x, m))
#  define trunc(x) __builtin_ia32_roundps(x, 0b1011)
# elif FLOAT_SIZE == 8
#  define dot_product(x, y) __builtin_ia32_dppd(x, y, 0b00110001)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd(y, x, m))
#  define trunc(x) __builtin_ia32_roundpd(x, 0b1011)
# endif
/* mix() takes alternate elements from x and y; pblendw needs the immediate
 * scaled to word granularity for the wider element sizes. */
# if INT_SIZE == 2 || UINT_SIZE == 2
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b10101010))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11001100))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11110000))
# elif FLOAT_SIZE == 4
#  define mix(x, y) __builtin_ia32_blendps(x, y, 0b1010)
# elif FLOAT_SIZE == 8
#  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
# endif
#endif
/*
 * 256-bit AVX float helpers.  vdpps operates on each 128-bit lane
 * independently, so the per-lane results (elements 0 and 4) get summed to
 * form the full dot product.  select2() exercises the masked load/store
 * insns, merging x (where the mask is set) with y (where it is clear).
 */
#if VEC_SIZE == 32 && defined(__AVX__) && !defined(__AVX512VL__)
# if FLOAT_SIZE == 4
#  define dot_product(x, y) ({ \
    vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
    (vec_t){t_[0] + t_[4]}; \
})
#  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
#  define select2(d, x, y, m) ({ \
    vsi_t m_ = (vsi_t)(m); \
    *(d) = __builtin_ia32_maskloadps256(&(x), m_); \
    __builtin_ia32_maskstoreps256(d, ~m_, y); \
})
#  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
# elif FLOAT_SIZE == 8
#  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
#  define select2(d, x, y, m) ({ \
    vdi_t m_ = (vdi_t)(m); \
    *(d) = __builtin_ia32_maskloadpd256(&(x), m_); \
    __builtin_ia32_maskstorepd256(d, ~m_, y); \
})
#  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
# endif
#endif
/*
 * Scalar case: a single-element "vector".  max()/min() fall back to plain C
 * comparisons; trunc() uses the scalar rounds{s,d} insns (immediate 0b1011:
 * truncate, exceptions suppressed), but only when the packed AVX512F forms
 * aren't to be used instead.
 */
#if VEC_SIZE == FLOAT_SIZE
# define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
# define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
# if defined(__SSE4_1__) && !defined(__AVX512F__)
#  if FLOAT_SIZE == 4
#   define trunc(x) scalar_1op(x, "roundss $0b1011, %[in], %[out]")
#  elif FLOAT_SIZE == 8
#   define trunc(x) scalar_1op(x, "roundsd $0b1011, %[in], %[out]")
#  endif
# endif
#endif
/*
 * AMD XOP helpers.  select() becomes vpcmov, a full bit-wise (not
 * element-wise) conditional move, hence no element-size distinction.  The
 * float swap2() variants use inline asm for vpermil2p{s,d} because the
 * corresponding builtins are buggy in gcc up to 7.1.0 (see the commented-out
 * builtin forms retained inside the macros).
 */
#ifdef __XOP__
# undef select
# if VEC_SIZE == 16
#  if INT_SIZE == 2 || INT_SIZE == 4
#   include "simd-fma.c"
#  endif
#  define select(d, x, y, m) \
    (*(d) = (vec_t)__builtin_ia32_vpcmov((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
#  if INT_SIZE == 1 || UINT_SIZE == 1
#   define swap2(x) ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), (vqi_t)inv - 1))
#  elif INT_SIZE == 2 || UINT_SIZE == 2
/* Build a byte-granular vpperm control reversing word-sized elements. */
#   define swap2(x) \
    ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), \
                                  (vqi_t)(__builtin_ia32_vprotwi(2 * (vhi_t)inv - 1, 8) | \
                                          (2 * inv - 2))))
#  elif FLOAT_SIZE == 4
#   define frac(x) __builtin_ia32_vfrczps(x)
#   undef swap2
#   define swap2(x) ({ \
    /* Buggy in gcc 7.1.0 and earlier. */ \
    /* __builtin_ia32_vpermil2ps((vec_t){}, x, __builtin_ia32_cvtps2dq(inv) + 3, 0) */ \
    vec_t t_; \
    asm ( "vpermil2ps $0, %3, %2, %1, %0" : \
          "=x" (t_) : \
          "x" ((vec_t){}), "m" (x), "x" (__builtin_ia32_cvtps2dq(inv) + 3) ); \
    t_; \
})
#  elif FLOAT_SIZE == 8
#   define frac(x) __builtin_ia32_vfrczpd(x)
#   undef swap2
#   define swap2(x) ({ \
    /* Buggy in gcc 7.1.0 and earlier. */ \
    /* __builtin_ia32_vpermil2pd((vec_t){}, x, */ \
    /* __builtin_ia32_pmovsxdq128( */ \
    /* __builtin_ia32_cvtpd2dq(inv) + 1) << 1, 0) */ \
    vdi_t s_ = __builtin_ia32_pmovsxdq128( \
                   __builtin_ia32_cvtpd2dq(inv) + 1) << 1; \
    vec_t t_; \
    asm ( "vpermil2pd $0, %3, %2, %1, %0" : \
          "=x" (t_) : "x" ((vec_t){}), "x" (x), "m" (s_) ); \
    t_; \
})
#  endif
/* XOP's vphadd*/vphsub* widen their (single) operand; pack two widened
 * results back together to retain the original element size. */
#  if INT_SIZE == 1
#   define hadd(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphaddbw((vqi_t)(x)), \
                                                         __builtin_ia32_vphaddbw((vqi_t)(y))))
#   define hsub(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphsubbw((vqi_t)(x)), \
                                                         __builtin_ia32_vphsubbw((vqi_t)(y))))
#  elif UINT_SIZE == 1
#   define hadd(x, y) ((vec_t)__builtin_ia32_packuswb128(__builtin_ia32_vphaddubw((vqi_t)(x)), \
                                                         __builtin_ia32_vphaddubw((vqi_t)(y))))
#  elif INT_SIZE == 2
#   undef hadd
#   define hadd(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphaddwd(x), \
                                                 __builtin_ia32_vphaddwd(y))
#   undef hsub
#   define hsub(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphsubwd(x), \
                                                 __builtin_ia32_vphsubwd(y))
#  elif UINT_SIZE == 2
#   undef hadd
#   define hadd(x, y) ((vec_t)__builtin_ia32_packusdw128(__builtin_ia32_vphadduwd((vhi_t)(x)), \
                                                         __builtin_ia32_vphadduwd((vhi_t)(y))))
#   undef hsub
#  endif
# elif VEC_SIZE == 32
#  define select(d, x, y, m) \
    (*(d) = (vec_t)__builtin_ia32_vpcmov256((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
#  if FLOAT_SIZE == 4
#   define frac(x) __builtin_ia32_vfrczps256(x)
#  elif FLOAT_SIZE == 8
#   define frac(x) __builtin_ia32_vfrczpd256(x)
#  endif
# elif VEC_SIZE == FLOAT_SIZE
#  if VEC_SIZE == 4
#   define frac(x) scalar_1op(x, "vfrczss %[in], %[out]")
#  elif VEC_SIZE == 8
#   define frac(x) scalar_1op(x, "vfrczsd %[in], %[out]")
#  endif
# endif
#endif
1112
1113 #if VEC_SIZE >= 16
1114
1115 # if !defined(low_half) && defined(HALF_SIZE)
low_half(vec_t x)1116 static inline half_t low_half(vec_t x)
1117 {
1118 # if HALF_SIZE < VEC_SIZE
1119 half_t y;
1120 unsigned int i;
1121
1122 for ( i = 0; i < ELEM_COUNT / 2; ++i )
1123 y[i] = x[i];
1124
1125 return y;
1126 # else
1127 return x;
1128 # endif
1129 }
1130 # endif
1131
1132 # if !defined(low_quarter) && defined(QUARTER_SIZE)
low_quarter(vec_t x)1133 static inline quarter_t low_quarter(vec_t x)
1134 {
1135 # if QUARTER_SIZE < VEC_SIZE
1136 quarter_t y;
1137 unsigned int i;
1138
1139 for ( i = 0; i < ELEM_COUNT / 4; ++i )
1140 y[i] = x[i];
1141
1142 return y;
1143 # else
1144 return x;
1145 # endif
1146 }
1147 # endif
1148
1149 # if !defined(low_eighth) && defined(EIGHTH_SIZE)
low_eighth(vec_t x)1150 static inline eighth_t low_eighth(vec_t x)
1151 {
1152 # if EIGHTH_SIZE < VEC_SIZE
1153 eighth_t y;
1154 unsigned int i;
1155
1156 for ( i = 0; i < ELEM_COUNT / 8; ++i )
1157 y[i] = x[i];
1158
1159 return y;
1160 # else
1161 return x;
1162 # endif
1163 }
1164 # endif
1165
1166 #endif
1167
/*
 * The broadcast_*()/insert_*() helpers defined earlier are named after
 * absolute element counts (pair, quartet, octet).  Alias them here to the
 * fraction-of-vector names (half, quarter, eighth) that the test code uses,
 * based on the vector's total element count.
 */
#ifdef broadcast_pair
# if ELEM_COUNT == 4
#  define broadcast_half broadcast_pair
# elif ELEM_COUNT == 8
#  define broadcast_quarter broadcast_pair
# elif ELEM_COUNT == 16
#  define broadcast_eighth broadcast_pair
# endif
#endif

#ifdef insert_pair
# if ELEM_COUNT == 4
#  define insert_half insert_pair
# elif ELEM_COUNT == 8
#  define insert_quarter insert_pair
# elif ELEM_COUNT == 16
#  define insert_eighth insert_pair
# endif
#endif

#ifdef broadcast_quartet
# if ELEM_COUNT == 8
#  define broadcast_half broadcast_quartet
# elif ELEM_COUNT == 16
#  define broadcast_quarter broadcast_quartet
# endif
#endif

#ifdef insert_quartet
# if ELEM_COUNT == 8
#  define insert_half insert_quartet
# elif ELEM_COUNT == 16
#  define insert_quarter insert_quartet
# endif
#endif

#if defined(broadcast_octet) && ELEM_COUNT == 16
# define broadcast_half broadcast_octet
#endif

#if defined(insert_octet) && ELEM_COUNT == 16
# define insert_half insert_octet
#endif

/* Fused-multiply-add tests, applicable to all AVX512F floating point cases. */
#if defined(__AVX512F__) && defined(FLOAT_SIZE)
# include "simd-fma.c"
#endif
1215
simd_test(void)1216 int simd_test(void)
1217 {
1218 unsigned int i, j;
1219 vec_t x, y, z, src, inv, alt, sh;
1220 vint_t interleave_lo, interleave_hi;
1221
1222 for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
1223 {
1224 src[i] = i + 1;
1225 inv[i] = ELEM_COUNT - i;
1226 #ifdef UINT_SIZE
1227 alt[i] = -!(i & 1);
1228 #else
1229 alt[i] = i & 1 ? -1 : 1;
1230 #endif
1231 if ( !(i & (i + 1)) )
1232 --j;
1233 sh[i] = j;
1234
1235 interleave_lo[i] = ((i & 1) * ELEM_COUNT) | (i >> 1);
1236 interleave_hi[i] = interleave_lo[i] + (ELEM_COUNT / 2);
1237 }
1238
1239 touch(src);
1240 x = src;
1241 touch(x);
1242 if ( !eq(x, src) ) return __LINE__;
1243
1244 touch(src);
1245 y = x + src;
1246 touch(src);
1247 touch(y);
1248 if ( !eq(y, 2 * src) ) return __LINE__;
1249
1250 touch(src);
1251 z = y -= src;
1252 touch(z);
1253 if ( !eq(x, z) ) return __LINE__;
1254
1255 #if defined(UINT_SIZE)
1256
1257 touch(inv);
1258 x |= inv;
1259 touch(inv);
1260 y &= inv;
1261 touch(inv);
1262 z ^= inv;
1263 touch(inv);
1264 touch(x);
1265 if ( !eq(x & ~y, z) ) return __LINE__;
1266
1267 #elif ELEM_SIZE > 1 || VEC_SIZE <= 8
1268
1269 touch(src);
1270 x *= src;
1271 y = inv * inv;
1272 touch(src);
1273 z = src + inv;
1274 touch(inv);
1275 z *= (src - inv);
1276 if ( !eq(x - y, z) ) return __LINE__;
1277
1278 #endif
1279
1280 #if defined(FLOAT_SIZE)
1281
1282 x = src * alt;
1283 touch(alt);
1284 y = src / alt;
1285 if ( !eq(x, y) ) return __LINE__;
1286 touch(alt);
1287 touch(src);
1288 if ( !eq(x * -alt, -src) ) return __LINE__;
1289
1290 # ifdef to_int
1291
1292 touch(src);
1293 x = to_int(src);
1294 touch(src);
1295 if ( !eq(x, src) ) return __LINE__;
1296
1297 # ifdef recip
1298 touch(src);
1299 x = recip(src);
1300 touch(src);
1301 touch(x);
1302 if ( !eq(to_int(recip(x)), src) ) return __LINE__;
1303
1304 # ifdef rsqrt
1305 x = src * src;
1306 touch(x);
1307 y = rsqrt(x);
1308 touch(y);
1309 if ( !eq(to_int(recip(y)), src) ) return __LINE__;
1310 touch(src);
1311 if ( !eq(to_int(y), to_int(recip(src))) ) return __LINE__;
1312 # endif
1313 # endif
1314
1315 # endif
1316
1317 # ifdef to_wint
1318 touch(src);
1319 x = to_wint(src);
1320 touch(src);
1321 if ( !eq(x, src) ) return __LINE__;
1322 # endif
1323
1324 # ifdef to_uint
1325 touch(src);
1326 x = to_uint(src);
1327 touch(src);
1328 if ( !eq(x, src) ) return __LINE__;
1329 # endif
1330
1331 # ifdef to_uwint
1332 touch(src);
1333 x = to_uwint(src);
1334 touch(src);
1335 if ( !eq(x, src) ) return __LINE__;
1336 # endif
1337
1338 # ifdef sqrt
1339 x = src * src;
1340 touch(x);
1341 if ( !eq(sqrt(x), src) ) return __LINE__;
1342 # endif
1343
1344 # ifdef trunc
1345 x = 1 / src;
1346 y = (vec_t){ 1 };
1347 touch(x);
1348 z = trunc(x);
1349 if ( !eq(y, z) ) return __LINE__;
1350 # endif
1351
1352 # ifdef frac
1353 touch(src);
1354 x = frac(src);
1355 touch(src);
1356 if ( !eq(x, (vec_t){}) ) return __LINE__;
1357
1358 x = 1 / (src + 1);
1359 touch(x);
1360 y = frac(x);
1361 touch(x);
1362 if ( !eq(x, y) ) return __LINE__;
1363 # endif
1364
1365 # if defined(trunc) && defined(frac)
1366 x = src / 4;
1367 touch(x);
1368 y = trunc(x);
1369 touch(x);
1370 z = frac(x);
1371 touch(x);
1372 if ( !eq(x, y + z) ) return __LINE__;
1373 # endif
1374
1375 #else
1376
1377 # if ELEM_SIZE > 1
1378
1379 touch(inv);
1380 x = src * inv;
1381 touch(inv);
1382 y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
1383 for ( i = 1; i < ELEM_COUNT / 2; ++i )
1384 y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
1385 if ( !eq(x, y) ) return __LINE__;
1386
1387 # ifdef mul_hi
1388 touch(alt);
1389 x = mul_hi(src, alt);
1390 touch(alt);
1391 # ifdef INT_SIZE
1392 if ( !eq(x, alt < 0) ) return __LINE__;
1393 # else
1394 if ( !eq(x, (src & alt) + alt) ) return __LINE__;
1395 # endif
1396 # endif
1397
1398 # ifdef mul_full
1399 x = src ^ alt;
1400 touch(inv);
1401 y = mul_full(x, inv);
1402 touch(inv);
1403 for ( i = 0; i < ELEM_COUNT; i += 2 )
1404 {
1405 unsigned long long res = x[i] * 1ULL * inv[i];
1406
1407 z[i] = res;
1408 z[i + 1] = res >> (ELEM_SIZE << 3);
1409 }
1410 if ( !eq(y, z) ) return __LINE__;
1411 # endif
1412
1413 z = src;
1414 # ifdef INT_SIZE
1415 z *= alt;
1416 # endif
1417 touch(z);
1418 x = z << 3;
1419 touch(z);
1420 y = z << 2;
1421 touch(z);
1422 if ( !eq(x, y + y) ) return __LINE__;
1423
1424 touch(x);
1425 z = x >> 2;
1426 touch(x);
1427 if ( !eq(y, z + z) ) return __LINE__;
1428
1429 z = src;
1430 # ifdef INT_SIZE
1431 z *= alt;
1432 # endif
1433 /*
1434 * Note that despite the touch()-es here there doesn't appear to be a way
1435 * to make the compiler use a memory operand for the shift instruction (at
1436 * least without resorting to built-ins).
1437 */
1438 j = 3;
1439 touch(j);
1440 x = z << j;
1441 touch(j);
1442 j = 2;
1443 touch(j);
1444 y = z << j;
1445 touch(j);
1446 if ( !eq(x, y + y) ) return __LINE__;
1447
1448 z = x >> j;
1449 touch(j);
1450 if ( !eq(y, z + z) ) return __LINE__;
1451
1452 # endif
1453
1454 # if ELEM_SIZE == 2 || defined(__SSE4_1__)
1455 /*
1456 * Even when there are no instructions with varying shift counts per
1457 * field, the code turns out to be a nice exercise for pextr/pinsr.
1458 */
1459 z = src;
1460 # ifdef INT_SIZE
1461 z *= alt;
1462 # endif
1463 /*
1464 * Zap elements for which the shift count is zero (and the hence the
1465 * decrement below would yield a negative count.
1466 */
1467 z &= (sh > 0);
1468 touch(sh);
1469 x = z << sh;
1470 touch(sh);
1471 --sh;
1472 touch(sh);
1473 y = z << sh;
1474 if ( !eq(x, y + y) ) return __LINE__;
1475
1476 # if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
1477 touch(sh);
1478 x = y >> sh;
1479 if ( !eq(x, z) ) return __LINE__;
1480 # endif
1481
1482 # endif
1483
1484 #endif
1485
1486 #if defined(max) && defined(min)
1487 # ifdef UINT_SIZE
1488 touch(inv);
1489 x = min(src, inv);
1490 touch(inv);
1491 y = max(src, inv);
1492 touch(inv);
1493 if ( !eq(x + y, src + inv) ) return __LINE__;
1494 # else
1495 x = src * alt;
1496 y = inv * alt;
1497 touch(y);
1498 z = max(x, y);
1499 touch(y);
1500 y = min(x, y);
1501 touch(y);
1502 if ( !eq((y + z) * alt, src + inv) ) return __LINE__;
1503 # endif
1504 #endif
1505
1506 #ifdef abs
1507 x = src * alt;
1508 touch(x);
1509 if ( !eq(abs(x), src) ) return __LINE__;
1510 #endif
1511
1512 #ifdef copysignz
1513 touch(alt);
1514 if ( !eq(copysignz((vec_t){} + 1, alt), alt) ) return __LINE__;
1515 #endif
1516
1517 #ifdef swap
1518 touch(src);
1519 if ( !eq(swap(src), inv) ) return __LINE__;
1520 #endif
1521
1522 #ifdef swap2
1523 touch(src);
1524 if ( !eq(swap2(src), inv) ) return __LINE__;
1525 #endif
1526
1527 #ifdef swap3
1528 touch(src);
1529 if ( !eq(swap3(src), inv) ) return __LINE__;
1530 touch(src);
1531 #endif
1532
1533 #ifdef broadcast
1534 if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
#endif

    /*
     * Remainder of simd_test(): exercise the optional vector operations.
     * Whether each helper macro (broadcast*, widen*, shrink*, select, mix,
     * hadd, getexp, ...) is #define-d depends on VEC_SIZE / element type /
     * enabled ISA; see the conditionals near the top of this file.  The
     * vectors src, inv, and alt, the scratch vectors x, y, z, and the
     * indices i, j are all set up earlier in this function.  The identities
     * checked below rely on src[i] == i + 1, inv[i] == ELEM_COUNT - i, and
     * alt alternating between +1 (even lanes) and -1 (odd lanes) -
     * presumably, as their initialization is outside this view (TODO
     * confirm against the earlier part of the function).  touch() appears
     * to be an optimization barrier forcing the compiler to actually carry
     * out each step - NOTE(review): confirm against its definition.
     */

#ifdef broadcast2
    /* src + inv should be the constant vector { ELEM_COUNT + 1, ... }. */
    if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
#endif

#if defined(broadcast_half) && defined(insert_half)
    /*
     * Broadcasting the low half of src must equal inserting that same half
     * into the high position of src.
     */
    {
    half_t aux = low_half(src);

    touch(aux);
    x = broadcast_half(aux);
    touch(aux);
    y = insert_half(src, aux, 1);
    if ( !eq(x, y) ) return __LINE__;
    }
#endif

#if defined(broadcast_quarter) && defined(insert_quarter)
    /*
     * Same idea at quarter granularity: broadcast must equal src with its
     * low quarter copied into all three higher positions.
     */
    {
    quarter_t aux = low_quarter(src);

    touch(aux);
    x = broadcast_quarter(aux);
    touch(aux);
    y = insert_quarter(src, aux, 1);
    touch(aux);
    y = insert_quarter(y, aux, 2);
    touch(aux);
    y = insert_quarter(y, aux, 3);
    if ( !eq(x, y) ) return __LINE__;
    }
#endif

#if defined(broadcast_eighth) && defined(insert_eighth) && \
    /* At least gcc 7.3 "optimizes" away all insert_eighth() calls below. */ \
    __GNUC__ >= 8
    /*
     * And once more at eighth granularity, filling all seven higher
     * positions from the low eighth.
     */
    {
    eighth_t aux = low_eighth(src);

    touch(aux);
    x = broadcast_eighth(aux);
    touch(aux);
    y = insert_eighth(src, aux, 1);
    touch(aux);
    y = insert_eighth(y, aux, 2);
    touch(aux);
    y = insert_eighth(y, aux, 3);
    touch(aux);
    y = insert_eighth(y, aux, 4);
    touch(aux);
    y = insert_eighth(y, aux, 5);
    touch(aux);
    y = insert_eighth(y, aux, 6);
    touch(aux);
    y = insert_eighth(y, aux, 7);
    if ( !eq(x, y) ) return __LINE__;
    }
#endif

#if defined(interleave_lo) && defined(interleave_hi)
    /*
     * Corresponding lanes of interleave_lo(inv, src) and
     * interleave_hi(inv, src) differ by ELEM_COUNT / 2 with alternating
     * sign; the sign is normalized away either by multiplying by alt or,
     * for unsigned element types (where multiplication by -1 would wrap),
     * by the xor/subtract conditional-negate idiom.
     */
    touch(src);
    x = interleave_lo(inv, src);
    touch(src);
    y = interleave_hi(inv, src);
    touch(src);
# ifdef UINT_SIZE
    z = ((x - y) ^ ~alt) - ~alt;
# else
    z = (x - y) * alt;
# endif
# ifdef broadcast
    if ( !eq(z, broadcast(ELEM_COUNT / 2)) ) return __LINE__;
# else
    if ( !eq(z, ELEM_COUNT / 2) ) return __LINE__;
# endif
#endif

#if defined(INT_SIZE) && defined(widen1) && defined(interleave_lo)

    /*
     * Sign-extension checks: x = src * alt has alternating-sign elements,
     * and widenN() of its low half/quarter must match a reference built by
     * interleaving with the per-lane sign mask alt < 0.
     */
    x = src * alt;
    y = interleave_lo(x, alt < 0);
    touch(x);
    z = widen1(low_half(x));
    touch(x);
    if ( !eq(z, y) ) return __LINE__;

# ifdef widen2
    y = interleave_lo(alt < 0, alt < 0);
    y = interleave_lo(z, y);
    touch(x);
    z = widen2(low_quarter(x));
    touch(x);
    if ( !eq(z, y) ) return __LINE__;

# ifdef widen3
    y = interleave_lo(alt < 0, alt < 0);
    y = interleave_lo(y, y);
    y = interleave_lo(z, y);
    touch(x);
    z = widen3(low_eighth(x));
    touch(x);
    if ( !eq(z, y) ) return __LINE__;
# endif
# endif

#endif

#if defined(UINT_SIZE) && defined(interleave_lo)

    /*
     * Zero-extension checks: interleaving with a zero vector once (y) or
     * twice (z) models unsigned widening by one or two steps.
     */
    y = interleave_lo(src, (vec_t){});
    z = interleave_lo(y, (vec_t){});

# ifdef widen1
    touch(src);
    x = widen1(low_half(src));
    touch(src);
    if ( !eq(x, y) ) return __LINE__;
# endif

# ifdef widen2
    touch(src);
    x = widen2(low_quarter(src));
    touch(src);
    if ( !eq(x, z) ) return __LINE__;
# endif

# ifdef widen3
    touch(src);
    x = widen3(low_eighth(src));
    touch(src);
    if ( !eq(x, interleave_lo(z, (vec_t){})) ) return __LINE__;
# endif

#endif

#if defined(widen1) && defined(shrink1)
    /* widen1() followed by shrink1() must reproduce the low half of src. */
    {
    half_t aux1 = low_half(src), aux2;

    touch(aux1);
    x = widen1(aux1);
    touch(x);
    aux2 = shrink1(x);
    touch(aux2);
    for ( i = 0; i < ELEM_COUNT / 2; ++i )
        if ( aux2[i] != src[i] )
            return __LINE__;
    }
#endif

#if defined(widen2) && defined(shrink2)
    /* Two-step round trip: low quarter of src must survive widen2/shrink2. */
    {
    quarter_t aux1 = low_quarter(src), aux2;

    touch(aux1);
    x = widen2(aux1);
    touch(x);
    aux2 = shrink2(x);
    touch(aux2);
    for ( i = 0; i < ELEM_COUNT / 4; ++i )
        if ( aux2[i] != src[i] )
            return __LINE__;
    }
#endif

#if defined(widen3) && defined(shrink3)
    /* Three-step round trip: low eighth of src must survive widen3/shrink3. */
    {
    eighth_t aux1 = low_eighth(src), aux2;

    touch(aux1);
    x = widen3(aux1);
    touch(x);
    aux2 = shrink3(x);
    touch(aux2);
    for ( i = 0; i < ELEM_COUNT / 8; ++i )
        if ( aux2[i] != src[i] )
            return __LINE__;
    }
#endif

#ifdef dup_lo
    /*
     * dup_lo() duplicates even-indexed elements into the following odd
     * positions, so dup_lo(src) - src is { 0, -1, 0, -1, ... } for
     * consecutive-valued src - which is exactly (alt - 1) / 2 given alt's
     * +1/-1 pattern.
     */
    touch(src);
    x = dup_lo(src);
    touch(src);
    if ( !eq(x - src, (alt - 1) / 2) ) return __LINE__;
#endif

#ifdef dup_hi
    /*
     * Mirror image of the above: dup_hi() copies odd-indexed elements
     * downwards, making the difference { 1, 0, 1, 0, ... } == (alt + 1) / 2.
     */
    touch(src);
    x = dup_hi(src);
    touch(src);
    if ( !eq(x - src, (alt + 1) / 2) ) return __LINE__;
#endif

    /*
     * Reference vector for the select()/select2()/mix() tests below: even
     * lanes taken from src, odd lanes from inv.
     */
    for ( i = 0; i < ELEM_COUNT; ++i )
        y[i] = (i & 1 ? inv : src)[i];

#ifdef select
    /*
     * Unsigned element types pass alt itself as the selector (its lanes'
     * sign bits do the choosing); otherwise a comparison result is used.
     */
# ifdef UINT_SIZE
    select(&z, src, inv, alt);
# else
    select(&z, src, inv, alt > 0);
# endif
    if ( !eq(z, y) ) return __LINE__;
#endif

#ifdef select2
    /* Alternative blend flavor; must agree with the same reference. */
# ifdef UINT_SIZE
    select2(&z, src, inv, alt);
# else
    select2(&z, src, inv, alt > 0);
# endif
    if ( !eq(z, y) ) return __LINE__;
#endif

#ifdef mix
    /* mix() with a fixed even/odd pattern must match the reference, too. */
    touch(src);
    touch(inv);
    x = mix(src, inv);
    if ( !eq(x, y) ) return __LINE__;

# ifdef addsub
    /*
     * addsub() subtracts in even and adds in odd element positions, i.e.
     * it equals mix() of the full-width difference and sum.
     */
    touch(src);
    touch(inv);
    x = addsub(src, inv);
    touch(src);
    touch(inv);
    y = mix(src - inv, src + inv);
    if ( !eq(x, y) ) return __LINE__;
# endif
#endif

#ifdef rotr
    /*
     * Rotating src right by one element yields { 2, ..., ELEM_COUNT, 1 },
     * which (ELEM_COUNT being a power of two) is also (src & (N-1)) + 1.
     */
    x = rotr(src, 1);
    y = (src & (ELEM_COUNT - 1)) + 1;
    if ( !eq(x, y) ) return __LINE__;
#endif

#ifdef dot_product
    /*
     * sum(k * (N + 1 - k)) for k = 1..N is N*(N+1)*(N+2)/6; dot_product()
     * leaves the scalar result in element 0.
     */
    touch(src);
    touch(inv);
    x = dot_product(src, inv);
    if ( !eq(x, (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
                         (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
#endif

#ifdef hadd
    /*
     * Skip byte-element configurations whose total N*(N+1)/2 would not fit
     * the element type (signed char overflows at 16 elements already,
     * unsigned char only beyond 16).
     */
# if (!defined(INT_SIZE) || INT_SIZE > 1 || ELEM_COUNT < 16) && \
     (!defined(UINT_SIZE) || UINT_SIZE > 1 || ELEM_COUNT <= 16)
    /* log2(N) horizontal-add passes reduce src to its element sum. */
    x = src;
    for ( i = ELEM_COUNT; i >>= 1; )
    {
        touch(x);
        x = hadd((vec_t){}, x);
    }
    if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
# endif

# ifdef hsub
    /*
     * src and inv hold the same element set, so horizontally subtracting
     * and then fully reducing must yield all zeroes.
     */
    touch(src);
    touch(inv);
    x = hsub(src, inv);
    for ( i = ELEM_COUNT; i >>= 1; )
        x = hadd(x, (vec_t){});
    if ( !eq(x, (vec_t){}) ) return __LINE__;
# endif
#endif

#if defined(getexp) && defined(getmant)
    /*
     * With src[i] == i + 1: the exponent y[i] steps up by one each time
     * i + 1 reaches 2^k - 1 (i.e. once per power-of-two boundary), and the
     * mantissa x[i] is exactly 1 wherever src[i] is a power of two
     * (i & (i + 1) == 0 <=> i == 2^k - 1).
     */
    touch(src);
    x = getmant(src);
    touch(src);
    y = getexp(src);
    touch(src);
    for ( j = i = 0; i < ELEM_COUNT; ++i )
    {
        if ( y[i] != j ) return __LINE__;

        if ( !((i + 1) & (i + 2)) )
            ++j;

        if ( !(i & (i + 1)) && x[i] != 1 ) return __LINE__;
    }
# ifdef scale
    /* Recombining: mantissa * 2^exponent must reconstruct src exactly. */
    touch(y);
    z = scale(x, y);
    if ( !eq(src, z) ) return __LINE__;
# endif
#endif

#if (defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)) || \
    (defined(__AVX512F__) && defined(FLOAT_SIZE))
    /*
     * Configurations with multiply-accumulate forms delegate to a separate
     * test; negate its result so its failure lines remain distinguishable
     * from this function's __LINE__ values.
     */
    return -fma_test();
#endif

    return 0;
}
1833