#include <stdbool.h>

#if defined(__i386__) && VEC_SIZE == 16
# define ENTRY(name) \
asm ( "\t.text\n" \
      "\t.globl _start\n" \
      "_start:\n" \
      "\tpush %ebp\n" \
      "\tmov %esp,%ebp\n" \
      "\tand $~0xf,%esp\n" \
      "\tcall " #name "\n" \
      "\tleave\n" \
      "\tret" )
#else
# define ENTRY(name) \
asm ( "\t.text\n" \
      "\t.globl _start\n" \
      "_start:\n" \
      "\tjmp " #name )
#endif
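/*
 * ENTRY() emits the image's _start symbol and transfers control to the named
 * C routine. The i386/SSE variant additionally realigns the stack to 16
 * bytes, since 4-byte entry alignment would fault on aligned vector spills.
 * Typical use (routine name here is illustrative): ENTRY(simd_test);
 */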

typedef
#if defined(INT_SIZE)
# define ELEM_SIZE INT_SIZE
signed int
# if INT_SIZE == 1
#  define MODE QI
# elif INT_SIZE == 2
#  define MODE HI
# elif INT_SIZE == 4
#  define MODE SI
# elif INT_SIZE == 8
#  define MODE DI
# endif
#elif defined(UINT_SIZE)
# define ELEM_SIZE UINT_SIZE
unsigned int
# if UINT_SIZE == 1
#  define MODE QI
# elif UINT_SIZE == 2
#  define MODE HI
# elif UINT_SIZE == 4
#  define MODE SI
# elif UINT_SIZE == 8
#  define MODE DI
# endif
#elif defined(FLOAT_SIZE)
float
# define ELEM_SIZE FLOAT_SIZE
# if FLOAT_SIZE == 4
#  define MODE SF
# elif FLOAT_SIZE == 8
#  define MODE DF
# endif
#endif
#ifndef VEC_SIZE
# define VEC_SIZE ELEM_SIZE
#endif
__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;

#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
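/*
 * Example instantiation: building with -DUINT_SIZE=2 -DVEC_SIZE=32 selects
 * mode HI above, making vec_t a 32-byte vector of ELEM_COUNT == 16 unsigned
 * 16-bit elements.
 */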

typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;

/* Various builtins want plain char / int / long long vector types ... */
typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
#if VEC_SIZE >= 8
typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
typedef double __attribute__((vector_size(VEC_SIZE))) vdf_t;
#endif

#if ELEM_SIZE == 1
typedef vqi_t vint_t;
#elif ELEM_SIZE == 2
typedef vhi_t vint_t;
#elif ELEM_SIZE == 4
typedef vsi_t vint_t;
#elif ELEM_SIZE == 8
typedef vdi_t vint_t;
#endif

#if VEC_SIZE >= 16

# if ELEM_COUNT >= 2
#  if VEC_SIZE > 32
#   define HALF_SIZE (VEC_SIZE / 2)
#  else
#   define HALF_SIZE 16
#  endif
typedef typeof((vec_t){}[0]) __attribute__((vector_size(HALF_SIZE))) half_t;
typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t;
typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
typedef float __attribute__((vector_size(HALF_SIZE))) vsf_half_t;
# endif

# if ELEM_COUNT >= 4
#  if VEC_SIZE > 64
#   define QUARTER_SIZE (VEC_SIZE / 4)
#  else
#   define QUARTER_SIZE 16
#  endif
typedef typeof((vec_t){}[0]) __attribute__((vector_size(QUARTER_SIZE))) quarter_t;
typedef char __attribute__((vector_size(QUARTER_SIZE))) vqi_quarter_t;
typedef short __attribute__((vector_size(QUARTER_SIZE))) vhi_quarter_t;
typedef int __attribute__((vector_size(QUARTER_SIZE))) vsi_quarter_t;
typedef long long __attribute__((vector_size(QUARTER_SIZE))) vdi_quarter_t;
# endif

# if ELEM_COUNT >= 8
#  if VEC_SIZE > 128
#   define EIGHTH_SIZE (VEC_SIZE / 8)
#  else
#   define EIGHTH_SIZE 16
#  endif
typedef typeof((vec_t){}[0]) __attribute__((vector_size(EIGHTH_SIZE))) eighth_t;
typedef char __attribute__((vector_size(EIGHTH_SIZE))) vqi_eighth_t;
typedef short __attribute__((vector_size(EIGHTH_SIZE))) vhi_eighth_t;
typedef int __attribute__((vector_size(EIGHTH_SIZE))) vsi_eighth_t;
typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
# endif

# define DECL_PAIR(w) \
typedef w ## _t pair_t; \
typedef vsi_ ## w ## _t vsi_pair_t; \
typedef vdi_ ## w ## _t vdi_pair_t
# define DECL_QUARTET(w) \
typedef w ## _t quartet_t; \
typedef vsi_ ## w ## _t vsi_quartet_t; \
typedef vdi_ ## w ## _t vdi_quartet_t
# define DECL_OCTET(w) \
typedef w ## _t octet_t; \
typedef vsi_ ## w ## _t vsi_octet_t; \
typedef vdi_ ## w ## _t vdi_octet_t

# if ELEM_COUNT == 4
DECL_PAIR(half);
# elif ELEM_COUNT == 8
DECL_PAIR(quarter);
DECL_QUARTET(half);
# elif ELEM_COUNT == 16
DECL_PAIR(eighth);
DECL_QUARTET(quarter);
DECL_OCTET(half);
# endif

# undef DECL_OCTET
# undef DECL_QUARTET
# undef DECL_PAIR

#endif

#if VEC_SIZE == 16
# define B(n, s, a...)   __builtin_ia32_ ## n ## 128 ## s(a)
# define B_(n, s, a...)  __builtin_ia32_ ## n ##        s(a)
#elif VEC_SIZE == 32
# define B(n, s, a...)   __builtin_ia32_ ## n ## 256 ## s(a)
#elif VEC_SIZE == 64
# define B(n, s, a...)   __builtin_ia32_ ## n ## 512 ## s(a)
# define BR(n, s, a...)  __builtin_ia32_ ## n ## 512 ## s(a, 4)
#endif
#ifndef B_
# define B_ B
#endif
#ifndef BR
# define BR B
# define BR_ B_
#endif
#ifndef BR_
# define BR_ BR
#endif
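/*
 * B() splices the vector width into a GCC builtin's name, B_() serves the
 * (SSE-era) builtins lacking the width infix, and BR() appends a rounding
 * mode argument (4, i.e. the current rounding mode) to the 512-bit forms.
 * For example, with VEC_SIZE == 16, B(pmaddwd, , x, y) expands to
 * __builtin_ia32_pmaddwd128(x, y), while B_(pshufd, , x, 0) expands to
 * __builtin_ia32_pshufd(x, 0).
 */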

#ifdef __AVX512F__

/* Sadly there are a few exceptions to the general naming rules. */
# define __builtin_ia32_broadcastf32x4_512_mask __builtin_ia32_broadcastf32x4_512
# define __builtin_ia32_broadcasti32x4_512_mask __builtin_ia32_broadcasti32x4_512
# define __builtin_ia32_exp2pd512_mask __builtin_ia32_exp2pd_mask
# define __builtin_ia32_exp2ps512_mask __builtin_ia32_exp2ps_mask
# define __builtin_ia32_insertf32x4_512_mask __builtin_ia32_insertf32x4_mask
# define __builtin_ia32_insertf32x8_512_mask __builtin_ia32_insertf32x8_mask
# define __builtin_ia32_insertf64x4_512_mask __builtin_ia32_insertf64x4_mask
# define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
# define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
# define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
# define __builtin_ia32_rcp28pd512_mask __builtin_ia32_rcp28pd_mask
# define __builtin_ia32_rcp28ps512_mask __builtin_ia32_rcp28ps_mask
# define __builtin_ia32_rndscalepd_512_mask __builtin_ia32_rndscalepd_mask
# define __builtin_ia32_rndscaleps_512_mask __builtin_ia32_rndscaleps_mask
# define __builtin_ia32_rsqrt28pd512_mask __builtin_ia32_rsqrt28pd_mask
# define __builtin_ia32_rsqrt28ps512_mask __builtin_ia32_rsqrt28ps_mask
# define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
# define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
# define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
# define __builtin_ia32_shuf_i64x2_512_mask __builtin_ia32_shuf_i64x2_mask

# if VEC_SIZE > ELEM_SIZE && (defined(VEC_MAX) ? VEC_MAX : VEC_SIZE) < 64
#  pragma GCC target ( "avx512vl" )
# endif

# define REN(insn, old, new)                     \
    asm ( ".macro v" #insn #old " o:vararg \n\t" \
          "v" #insn #new " \\o             \n\t" \
          ".endm" )
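/*
 * REN() renames a mnemonic at the assembler level: e.g.
 * REN(extract, f128, f64x2) defines an assembler macro vextractf128 which
 * simply re-emits its operands under the vextractf64x2 mnemonic.
 */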

/*
 * The original plan was to effect use of EVEX encodings for scalar as well as
 * 128- and 256-bit insn variants by restricting the compiler to use (on 64-bit
 * only of course) XMM16-XMM31 only. All sorts of compiler errors result when
 * doing this with gcc 8.2. Therefore resort to injecting {evex} prefixes,
 * which has the benefit of also working for 32-bit. Granted, there is a lot of
 * escaping to get right here.
 */
asm ( ".macro override insn    \n\t"
      ".macro $\\insn o:vararg \n\t"
      ".purgem \\insn          \n\t"
      "{evex} \\insn \\(\\)o   \n\t"
      ".macro \\insn o:vararg  \n\t"
      "$\\insn \\(\\(\\))o     \n\t"
      ".endm                   \n\t"
      ".endm                   \n\t"
      ".macro \\insn o:vararg  \n\t"
      "$\\insn \\(\\)o         \n\t"
      ".endm                   \n\t"
      ".endm" );
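/*
 * After "override vaddps" (see OVR() below), each use of vaddps invokes the
 * wrapper macro, which forwards its operands to the helper macro $vaddps.
 * That helper purges the wrapper (so the bare mnemonic can be emitted),
 * issues "{evex} vaddps <operands>", and then re-installs the wrapper for
 * the next use.
 */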

# define OVR(n) asm ( "override v" #n )
# define OVR_SFP(n) OVR(n ## sd); OVR(n ## ss)

# ifdef __AVX512VL__
#  ifdef __AVX512BW__
#   define OVR_BW(n) OVR(p ## n ## b); OVR(p ## n ## w)
#  else
#   define OVR_BW(n)
#  endif
#  define OVR_DQ(n) OVR(p ## n ## d); OVR(p ## n ## q)
#  define OVR_VFP(n) OVR(n ## pd); OVR(n ## ps)
# else
#  define OVR_BW(n)
#  define OVR_DQ(n)
#  define OVR_VFP(n)
# endif

# define OVR_FMA(n, w) OVR_ ## w(n ## 132); OVR_ ## w(n ## 213); \
                       OVR_ ## w(n ## 231)
# define OVR_FP(n) OVR_VFP(n); OVR_SFP(n)
# define OVR_INT(n) OVR_BW(n); OVR_DQ(n)
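/*
 * E.g. OVR_FP(add) overrides vaddsd and vaddss, plus (when AVX512VL is
 * available, so the packed overrides aren't no-ops) vaddpd and vaddps;
 * OVR_INT(add) similarly covers vpaddb/vpaddw (AVX512BW) and vpaddd/vpaddq.
 */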

OVR_INT(broadcast);
OVR_SFP(broadcast);
OVR_SFP(comi);
OVR_VFP(cvtdq2);
OVR_INT(abs);
OVR_FP(add);
OVR_INT(add);
OVR_BW(adds);
OVR_BW(addus);
OVR_BW(avg);
OVR_FP(div);
OVR(extractps);
OVR_FMA(fmadd, FP);
OVR_FMA(fmaddsub, VFP);
OVR_FMA(fmsub, FP);
OVR_FMA(fmsubadd, VFP);
OVR_FMA(fnmadd, FP);
OVR_FMA(fnmsub, FP);
OVR(insertps);
OVR_FP(max);
OVR_INT(maxs);
OVR_INT(maxu);
OVR_FP(min);
OVR_INT(mins);
OVR_INT(minu);
OVR(movd);
OVR(movq);
OVR_SFP(mov);
OVR_VFP(mova);
OVR(movhlps);
OVR(movhpd);
OVR(movhps);
OVR(movlhps);
OVR(movlpd);
OVR(movlps);
OVR_VFP(movnt);
OVR_VFP(movu);
OVR_FP(mul);
OVR_VFP(perm);
OVR_VFP(permil);
OVR_VFP(shuf);
OVR_INT(sll);
OVR_DQ(sllv);
OVR_FP(sqrt);
OVR_INT(sra);
OVR_DQ(srav);
OVR_INT(srl);
OVR_DQ(srlv);
OVR_FP(sub);
OVR_INT(sub);
OVR_BW(subs);
OVR_BW(subus);
OVR_SFP(ucomi);
OVR_VFP(unpckh);
OVR_VFP(unpckl);

# ifdef __AVX512VL__
#  if ELEM_SIZE == 8 && defined(__AVX512DQ__)
REN(extract, f128, f64x2);
REN(extract, i128, i64x2);
REN(insert, f128, f64x2);
REN(insert, i128, i64x2);
#  else
REN(extract, f128, f32x4);
REN(extract, i128, i32x4);
REN(insert, f128, f32x4);
REN(insert, i128, i32x4);
#  endif
#  if ELEM_SIZE == 8
REN(movdqa, , 64);
REN(movdqu, , 64);
REN(pand, , q);
REN(pandn, , q);
REN(por, , q);
REN(pxor, , q);
#  else
#   if ELEM_SIZE == 1 && defined(__AVX512BW__)
REN(movdq, a, u8);
REN(movdqu, , 8);
#   elif ELEM_SIZE == 2 && defined(__AVX512BW__)
REN(movdq, a, u16);
REN(movdqu, , 16);
#   else
REN(movdqa, , 32);
REN(movdqu, , 32);
#   endif
REN(pand, , d);
REN(pandn, , d);
REN(por, , d);
REN(pxor, , d);
#  endif
OVR(aesdec);
OVR(aesdeclast);
OVR(aesenc);
OVR(aesenclast);
OVR(cvtpd2dqx);
OVR(cvtpd2dqy);
OVR(cvtpd2psx);
OVR(cvtpd2psy);
OVR(cvtph2ps);
OVR(cvtps2dq);
OVR(cvtps2pd);
OVR(cvtps2ph);
OVR(cvtsd2ss);
OVR(cvtsd2si);
OVR(cvtsd2sil);
OVR(cvtsd2siq);
OVR(cvtsi2sd);
OVR(cvtsi2sdl);
OVR(cvtsi2sdq);
OVR(cvtsi2ss);
OVR(cvtsi2ssl);
OVR(cvtsi2ssq);
OVR(cvtss2sd);
OVR(cvtss2si);
OVR(cvtss2sil);
OVR(cvtss2siq);
OVR(cvttpd2dqx);
OVR(cvttpd2dqy);
OVR(cvttps2dq);
OVR(cvttsd2si);
OVR(cvttsd2sil);
OVR(cvttsd2siq);
OVR(cvttss2si);
OVR(cvttss2sil);
OVR(cvttss2siq);
OVR(gf2p8mulb);
OVR(movddup);
OVR(movntdq);
OVR(movntdqa);
OVR(movshdup);
OVR(movsldup);
OVR(pclmulqdq);
OVR(permd);
OVR(permq);
OVR(pmovsxbd);
OVR(pmovsxbq);
OVR(pmovsxdq);
OVR(pmovsxwd);
OVR(pmovsxwq);
OVR(pmovzxbd);
OVR(pmovzxbq);
OVR(pmovzxdq);
OVR(pmovzxwd);
OVR(pmovzxwq);
OVR(pmulld);
OVR(pmuldq);
OVR(pmuludq);
OVR(pshufd);
OVR(punpckhdq);
OVR(punpckhqdq);
OVR(punpckldq);
OVR(punpcklqdq);
# endif

# ifdef __AVX512BW__
OVR(pextrb);
OVR(pextrw);
OVR(pinsrb);
OVR(pinsrw);
#  ifdef __AVX512VL__
OVR(packssdw);
OVR(packsswb);
OVR(packusdw);
OVR(packuswb);
OVR(palignr);
OVR(pmaddubsw);
OVR(pmaddwd);
OVR(pmovsxbw);
OVR(pmovzxbw);
OVR(pmulhrsw);
OVR(pmulhuw);
OVR(pmulhw);
OVR(pmullw);
OVR(psadbw);
OVR(pshufb);
OVR(pshufhw);
OVR(pshuflw);
OVR(pslldq);
OVR(psrldq);
OVR(punpckhbw);
OVR(punpckhwd);
OVR(punpcklbw);
OVR(punpcklwd);
#  endif
# endif

# ifdef __AVX512DQ__
OVR_VFP(and);
OVR_VFP(andn);
OVR_VFP(or);
OVR(pextrd);
OVR(pextrq);
OVR(pinsrd);
OVR(pinsrq);
#  ifdef __AVX512VL__
OVR(pmullq);
#  endif
OVR_VFP(xor);
# endif

# undef OVR_VFP
# undef OVR_SFP
# undef OVR_INT
# undef OVR_FP
# undef OVR_FMA
# undef OVR_DQ
# undef OVR_BW
# undef OVR

#endif /* __AVX512F__ */

/*
 * Suppress value propagation by the compiler, preventing unwanted
 * optimization. This at once makes the compiler use memory operands
 * more often, which for our purposes is the more interesting case.
 */
#define touch(var) asm volatile ( "" : "+m" (var) )
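/*
 * Example: touch(x) between two uses of x makes the compiler treat x as
 * having been modified in memory, so it re-reads the value (often directly
 * as a memory operand of the next instruction) instead of reusing a
 * register copy.
 */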

static inline vec_t undef(void)
{
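    /* Deliberately self-initialized: yields an indeterminate value while
     * keeping the compiler's uninitialized-variable warnings quiet. */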
    vec_t v = v;
    return v;
}