/*
 * Size in bytes of one data element: INT_SIZE when the build defines it,
 * FLOAT_SIZE otherwise.
 */
#ifdef INT_SIZE
# define ELEM_SIZE INT_SIZE
#else
# define ELEM_SIZE FLOAT_SIZE
#endif

/*
 * Size in bytes of the data vector.  When indexes are wider than the data
 * elements the vector shrinks by the ELEM_SIZE / IDX_SIZE ratio, i.e. it then
 * holds VEC_MAX / IDX_SIZE elements.  It is never made smaller than 16 bytes.
 * NOTE(review): this is set up ahead of including simd.h, which presumably
 * consumes it - confirm.
 */
#define VEC_SIZE (IDX_SIZE <= ELEM_SIZE ? VEC_MAX \
                                        : VEC_MAX * ELEM_SIZE / IDX_SIZE)
#if VEC_SIZE < 16
# undef VEC_SIZE
# define VEC_SIZE 16
#endif

14 #include "simd.h"
15 
16 ENTRY(sg_test);
17 
18 #undef MODE
19 #if IDX_SIZE == 4
20 # define MODE SI
21 #elif IDX_SIZE == 8
22 # define MODE DI
23 #endif
24 
25 #define IVEC_SIZE (ELEM_SIZE <= IDX_SIZE ? VEC_MAX \
26                                          : VEC_MAX * IDX_SIZE / ELEM_SIZE)
27 #if IVEC_SIZE < 16
28 # undef IVEC_SIZE
29 # define IVEC_SIZE 16
30 #endif
31 
32 typedef signed int __attribute__((mode(MODE), vector_size(IVEC_SIZE))) idx_t;
33 typedef long long __attribute__((vector_size(IVEC_SIZE))) idi_t;
34 
35 #define ITEM_COUNT (VEC_SIZE / ELEM_SIZE < IVEC_SIZE / IDX_SIZE ? \
36                     VEC_SIZE / ELEM_SIZE : IVEC_SIZE / IDX_SIZE)
37 
/*
 * Per-ISA wrappers giving the test body uniform
 *     gather(reg, mem, idx, msk, scl)
 *     scatter(mem, idx, reg, msk, scl)
 * macros on top of the compiler built-ins.  scatter() is only defined by the
 * AVX512 variant.
 */
#if defined(__AVX512F__)
/*
 * AVX512 variant: the built-ins take an integer bitmask, so vector-typed
 * masks get converted first.
 * NOTE(review): B() is not defined in this file; presumably simd.h provides
 * it to form the built-in name matching the vector size - confirm.
 */
/* Bitmask with the low ELEM_COUNT bits set. */
# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
# if ELEM_SIZE == 4
#  if IDX_SIZE == 4 || defined(__AVX512VL__)
#   define to_mask(msk) B(ptestmd, , (vsi_t)(msk), (vsi_t)(msk), ~0)
#   define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
#  else
/*
 * 8-byte indexes without AVX512VL: the 4-byte elements are widened to 8 bytes
 * and then tested / compared with the 512-bit built-ins.
 */
#   define widen(x) __builtin_ia32_pmovzxdq512_mask((vsi_t)(x), (idi_t){}, ~0)
#   define to_mask(msk) __builtin_ia32_ptestmq512(widen(msk), widen(msk), ~0)
#   define eq(x, y) (__builtin_ia32_pcmpeqq512_mask(widen(x), widen(y), ~0) == ALL_TRUE)
#  endif
#  define BG_(dt, it, reg, mem, idx, msk, scl) \
    __builtin_ia32_gather##it##dt(reg, mem, idx, to_mask(msk), scl)
#  define BS_(dt, it, mem, idx, reg, msk, scl) \
    __builtin_ia32_scatter##it##dt(mem, to_mask(msk), idx, reg, scl)
# else
#  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
#  define BG_(dt, it, reg, mem, idx, msk, scl) \
    __builtin_ia32_gather##it##dt(reg, mem, idx, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), scl)
#  define BS_(dt, it, mem, idx, reg, msk, scl) \
    __builtin_ia32_scatter##it##dt(mem, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), idx, reg, scl)
# endif
/*
 * Instead of replicating the main IDX_SIZE conditional below three times, use
 * a double layer of macro invocations, allowing for substitution of the
 * respective relevant macro argument tokens.
 */
# define BG(dt, it, reg, mem, idx, msk, scl) BG_(dt, it, reg, mem, idx, msk, scl)
# define BS(dt, it, mem, idx, reg, msk, scl) BS_(dt, it##i, mem, idx, reg, msk, scl)
# if VEC_MAX < 64
/*
 * The sub-512-bit built-ins have an extra "3" infix, presumably because the
 * 512-bit names were chosen without the AVX512VL extension in mind (and hence
 * making the latter collide with the AVX2 ones).
 */
#  define si 3si
#  define di 3di
# endif
/* Map the 512-bit vector type infixes onto those of the vector size in use. */
# if VEC_MAX == 16
#  define v8df v2df
#  define v8di v2di
#  define v16sf v4sf
#  define v16si v4si
# elif VEC_MAX == 32
#  define v8df v4df
#  define v8di v4di
#  define v16sf v8sf
#  define v16si v8si
# endif
/* Select the built-in by index width, then by element type and width. */
# if IDX_SIZE == 4
#  if INT_SIZE == 4
#   define gather(reg, mem, idx, msk, scl) BG(v16si, si, reg, mem, idx, msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v16si, s, mem, idx, reg, msk, scl)
#  elif INT_SIZE == 8
#   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, si, (vdi_t)(reg), mem, idx, msk, scl))
#   define scatter(mem, idx, reg, msk, scl) BS(v8di, s, mem, idx, (vdi_t)(reg), msk, scl)
#  elif FLOAT_SIZE == 4
#   define gather(reg, mem, idx, msk, scl) BG(v16sf, si, reg, mem, idx, msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, s, mem, idx, reg, msk, scl)
#  elif FLOAT_SIZE == 8
#   define gather(reg, mem, idx, msk, scl) BG(v8df, si, reg, mem, idx, msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v8df, s, mem, idx, reg, msk, scl)
#  endif
# elif IDX_SIZE == 8
#  if INT_SIZE == 4
#   define gather(reg, mem, idx, msk, scl) BG(v16si, di, reg, mem, (idi_t)(idx), msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v16si, d, mem, (idi_t)(idx), reg, msk, scl)
#  elif INT_SIZE == 8
#   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, di, (vdi_t)(reg), mem, (idi_t)(idx), msk, scl))
#   define scatter(mem, idx, reg, msk, scl) BS(v8di, d, mem, (idi_t)(idx), (vdi_t)(reg), msk, scl)
#  elif FLOAT_SIZE == 4
#   define gather(reg, mem, idx, msk, scl) BG(v16sf, di, reg, mem, (idi_t)(idx), msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, d, mem, (idi_t)(idx), reg, msk, scl)
#  elif FLOAT_SIZE == 8
#   define gather(reg, mem, idx, msk, scl) BG(v8df, di, reg, mem, (idi_t)(idx), msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v8df, d, mem, (idi_t)(idx), reg, msk, scl)
#  endif
# endif
#elif defined(__AVX2__)
/*
 * AVX2 variant: gather only, with vector-typed masks handed to the built-ins
 * directly.
 *
 * to_bool() tests a comparison result against an all-ones vector
 * ((vec_t){} == 0).
 * NOTE(review): presumably this yields non-zero only when every element of
 * cmp compared true - confirm against the PTEST documentation.
 */
# if VEC_SIZE == 16
#  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
# else
#  define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
# endif

/* Select the built-in by vector size, index width, and element type / width. */
# if VEC_MAX == 16
#  if IDX_SIZE == 4
#   if INT_SIZE == 4
#    define gather __builtin_ia32_gathersiv4si
#   elif INT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            (vec_t)(__builtin_ia32_gathersiv2di((vdi_t)(reg), \
                                                (const void *)(mem), \
                                                idx, (vdi_t)(msk), scl))
#   elif FLOAT_SIZE == 4
#    define gather __builtin_ia32_gathersiv4sf
#   elif FLOAT_SIZE == 8
#    define gather __builtin_ia32_gathersiv2df
#   endif
#  elif IDX_SIZE == 8
#   if INT_SIZE == 4
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4si(reg, mem, (vdi_t)(idx), msk, scl)
#   elif INT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            (vec_t)(__builtin_ia32_gatherdiv2di((vdi_t)(reg), \
                                                (const void *)(mem), \
                                                (vdi_t)(idx), (vdi_t)(msk), \
                                                scl))
#   elif FLOAT_SIZE == 4
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4sf(reg, mem, (vdi_t)(idx), msk, scl)
#   elif FLOAT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv2df(reg, mem, (vdi_t)(idx), msk, scl)
#   endif
#  endif
# elif VEC_MAX == 32
#  if IDX_SIZE == 4
#   if INT_SIZE == 4
#    define gather __builtin_ia32_gathersiv8si
#   elif INT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            (vec_t)(__builtin_ia32_gathersiv4di((vdi_t)(reg), \
                                                (const void *)(mem), \
                                                idx, (vdi_t)(msk), scl))

#   elif FLOAT_SIZE == 4
#    define gather __builtin_ia32_gathersiv8sf
#   elif FLOAT_SIZE == 8
#    define gather __builtin_ia32_gathersiv4df
#   endif
#  elif IDX_SIZE == 8
#   if INT_SIZE == 4
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4si256(reg, mem, (idi_t)(idx), msk, scl)
#   elif INT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            (vec_t)(__builtin_ia32_gatherdiv4di((vdi_t)(reg), \
                                                (const void *)(mem), \
                                                (vdi_t)(idx), (vdi_t)(msk), \
                                                scl))

#   elif FLOAT_SIZE == 4
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4sf256(reg, mem, (idi_t)(idx), msk, scl)
#   elif FLOAT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4df(reg, mem, (vdi_t)(idx), msk, scl)
#   endif
#  endif
# endif
#endif

/*
 * Fallback whole-vector equality check, used when no eq() has been defined
 * earlier: compare element-wise and reduce the result with to_bool().
 */
#ifndef eq
# define eq(x, y) to_bool((x) == (y))
#endif

/* Token pasting with both operands macro-expanded first. */
#define GLUE_(x, y) x ## y
#define GLUE(x, y) GLUE_(x, y)

/*
 * PUT<count>(n) expands to the comma separated list of <count> consecutive
 * values n, n + 1, ..., n + <count> - 1, each size being built from two
 * invocations of the next smaller one.
 */
#define PUT2(n)      (n),        (n) +  1
#define PUT4(n)  PUT2(n),   PUT2((n) +  2)
#define PUT8(n)  PUT4(n),   PUT4((n) +  4)
#define PUT16(n) PUT8(n),   PUT8((n) +  8)
#define PUT32(n) PUT16(n), PUT16((n) + 16)
#define PUT64(n) PUT32(n), PUT32((n) + 32)

206 const typeof((vec_t){}[0]) array[] = {
207     GLUE(PUT, VEC_MAX)(1),
208     GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
209 };
210 
211 typeof((vec_t){}[0]) out[VEC_MAX * 2];
212 
sg_test(void)213 int sg_test(void)
214 {
215     unsigned int i;
216     vec_t x, y, full = (vec_t){} == 0;
217     idx_t idx, inv;
218 
219     for ( i = 0; i < IVEC_SIZE / IDX_SIZE; ++i )
220     {
221         idx[i] = i + 1;
222         inv[i] = ITEM_COUNT - i;
223     }
224 
225     touch(idx);
226     touch(inv);
227 
228     x = gather(full, array, (idx_t){}, full, 1);
229     for ( i = 0; i < ITEM_COUNT; ++i )
230         if ( x[i] != 1 )
231             return __LINE__;
232     for ( ; i < ELEM_COUNT; ++i )
233         if ( x[i] )
234             return __LINE__;
235 
236     x = gather(full, array, idx, full, ELEM_SIZE);
237     for ( i = 0; i < ITEM_COUNT; ++i )
238         if ( x[i] != i + 2 )
239             return __LINE__;
240     for ( ; i < ELEM_COUNT; ++i )
241         if ( x[i] )
242             return __LINE__;
243 
244     x = gather(full, array, idx * ELEM_SIZE, full, 2);
245     for ( i = 0; i < ITEM_COUNT; ++i )
246         if ( x[i] != i * 2 + 3 )
247             return __LINE__;
248     for ( ; i < ELEM_COUNT; ++i )
249         if ( x[i] )
250             return __LINE__;
251 
252     x = gather(full, array, inv, full, ELEM_SIZE);
253     for ( i = 0; i < ITEM_COUNT; ++i )
254         if ( x[i] != inv[i] + 1 )
255             return __LINE__;
256     for ( ; i < ELEM_COUNT; ++i )
257         if ( x[i] )
258             return __LINE__;
259 
260     y = gather(full, array + ITEM_COUNT, -idx, full, ELEM_SIZE);
261 #if ITEM_COUNT == ELEM_COUNT
262     if ( !eq(y, x - 1) )
263         return __LINE__;
264 #else
265     for ( i = 0; i < ITEM_COUNT; ++i )
266         if ( y[i] != x[i] - 1 )
267             return __LINE__;
268     for ( ; i < ELEM_COUNT; ++i )
269         if ( y[i] )
270             return __LINE__;
271 #endif
272 
273 #if ELEM_SIZE > 1
274     x = gather(full, array, inv * 2, full, ELEM_SIZE / 2);
275     for ( i = 0; i < ITEM_COUNT; ++i )
276         if ( x[i] != inv[i] + 1 )
277             return __LINE__;
278     for ( ; i < ELEM_COUNT; ++i )
279         if ( x[i] )
280             return __LINE__;
281 
282 # if ELEM_SIZE == IDX_SIZE
283     y = gather(x, array, idx, (idx & inv) != 0, ELEM_SIZE);
284     for ( i = 0; i < ITEM_COUNT; ++i )
285         if ( y[i] != ((i + 1) & (ITEM_COUNT - i) ? idx : inv)[i] + 1 )
286             return __LINE__;
287     for ( ; i < ELEM_COUNT; ++i )
288         if ( y[i] )
289             return __LINE__;
290 # endif
291 #endif
292 
293 #ifdef scatter
294 
295     for ( i = 0; i < sizeof(out) / sizeof(*out); ++i )
296         out[i] = 0;
297 
298     for ( i = 0; i < ITEM_COUNT; ++i )
299         x[i] = i + 1;
300 
301     touch(x);
302 
303     scatter(out, (idx_t){}, x, (vec_t){ 1 } != 0, 1);
304     if ( out[0] != 1 )
305         return __LINE__;
306     for ( i = 1; i < ITEM_COUNT; ++i )
307         if ( out[i] )
308             return __LINE__;
309 
310     scatter(out, (idx_t){}, x, full, 1);
311     if ( out[0] != ITEM_COUNT )
312         return __LINE__;
313     for ( i = 1; i < ITEM_COUNT; ++i )
314         if ( out[i] )
315             return __LINE__;
316 
317     scatter(out, idx, x, full, ELEM_SIZE);
318     for ( i = 1; i <= ITEM_COUNT; ++i )
319         if ( out[i] != i )
320             return __LINE__;
321 
322     scatter(out, inv, x, full, ELEM_SIZE);
323     for ( i = 1; i <= ITEM_COUNT; ++i )
324         if ( out[i] != ITEM_COUNT + 1 - i )
325             return __LINE__;
326 
327 #endif
328 
329     return 0;
330 }
331