/*
 * Size in bytes of one data element: INT_SIZE when the build defines it,
 * FLOAT_SIZE otherwise.  These, as well as IDX_SIZE and VEC_MAX, are not
 * defined in this file and are expected to be supplied from outside
 * (presumably by the build system -- confirm).
 */
#ifdef INT_SIZE
# define ELEM_SIZE INT_SIZE
#else
# define ELEM_SIZE FLOAT_SIZE
#endif

/*
 * Size in bytes of the data vector.  With indexes no wider than elements the
 * data vector is VEC_MAX bytes; with wider indexes it is scaled down by
 * ELEM_SIZE / IDX_SIZE.  The result is clamped to a minimum of 16 bytes.
 */
#define VEC_SIZE (IDX_SIZE <= ELEM_SIZE ? VEC_MAX \
                                        : VEC_MAX * ELEM_SIZE / IDX_SIZE)
#if VEC_SIZE < 16
# undef VEC_SIZE
# define VEC_SIZE 16
#endif
13
14 #include "simd.h"
15
16 ENTRY(sg_test);
17
18 #undef MODE
19 #if IDX_SIZE == 4
20 # define MODE SI
21 #elif IDX_SIZE == 8
22 # define MODE DI
23 #endif
24
25 #define IVEC_SIZE (ELEM_SIZE <= IDX_SIZE ? VEC_MAX \
26 : VEC_MAX * IDX_SIZE / ELEM_SIZE)
27 #if IVEC_SIZE < 16
28 # undef IVEC_SIZE
29 # define IVEC_SIZE 16
30 #endif
31
32 typedef signed int __attribute__((mode(MODE), vector_size(IVEC_SIZE))) idx_t;
33 typedef long long __attribute__((vector_size(IVEC_SIZE))) idi_t;
34
35 #define ITEM_COUNT (VEC_SIZE / ELEM_SIZE < IVEC_SIZE / IDX_SIZE ? \
36 VEC_SIZE / ELEM_SIZE : IVEC_SIZE / IDX_SIZE)
37
/*
 * Select the compiler built-ins backing the gather() and (AVX512F only)
 * scatter() operations used by the test, based on the instruction set
 * extension being compiled for and on VEC_MAX / IDX_SIZE / INT_SIZE /
 * FLOAT_SIZE.
 *
 * Common argument order of the resulting macros:
 *   gather(reg, mem, idx, msk, scl)
 *   scatter(mem, idx, reg, msk, scl)
 *
 * B(), ELEM_COUNT, vec_t, vsi_t, and vdi_t are not defined in this file
 * (presumably provided by simd.h).
 */
#if defined(__AVX512F__)
/* Mask value with the low ELEM_COUNT bits set. */
# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
# if ELEM_SIZE == 4
#  if IDX_SIZE == 4 || defined(__AVX512VL__)
#   define to_mask(msk) B(ptestmd, , (vsi_t)(msk), (vsi_t)(msk), ~0)
#   define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
#  else
/* No AVX512VL: widen the 32-bit lanes so the 512-bit qword forms can be used. */
#   define widen(x) __builtin_ia32_pmovzxdq512_mask((vsi_t)(x), (idi_t){}, ~0)
#   define to_mask(msk) __builtin_ia32_ptestmq512(widen(msk), widen(msk), ~0)
#   define eq(x, y) (__builtin_ia32_pcmpeqq512_mask(widen(x), widen(y), ~0) == ALL_TRUE)
#  endif
#  define BG_(dt, it, reg, mem, idx, msk, scl) \
    __builtin_ia32_gather##it##dt(reg, mem, idx, to_mask(msk), scl)
#  define BS_(dt, it, mem, idx, reg, msk, scl) \
    __builtin_ia32_scatter##it##dt(mem, to_mask(msk), idx, reg, scl)
# else
#  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
#  define BG_(dt, it, reg, mem, idx, msk, scl) \
    __builtin_ia32_gather##it##dt(reg, mem, idx, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), scl)
#  define BS_(dt, it, mem, idx, reg, msk, scl) \
    __builtin_ia32_scatter##it##dt(mem, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), idx, reg, scl)
# endif
/*
 * Instead of replicating the main IDX_SIZE conditional below three times, use
 * a double layer of macro invocations, allowing for substitution of the
 * respective relevant macro argument tokens.
 */
# define BG(dt, it, reg, mem, idx, msk, scl) BG_(dt, it, reg, mem, idx, msk, scl)
/* Callers pass "s" / "d"; the index type token is completed here by pasting. */
# define BS(dt, it, mem, idx, reg, msk, scl) BS_(dt, it##i, mem, idx, reg, msk, scl)
# if VEC_MAX < 64
/*
 * The sub-512-bit built-ins have an extra "3" infix, presumably because the
 * 512-bit names were chosen without the AVX512VL extension in mind (and hence
 * making the latter collide with the AVX2 ones).
 */
#  define si 3si
#  define di 3di
# endif
/* Map the 512-bit data type suffixes onto the ones matching VEC_MAX. */
# if VEC_MAX == 16
#  define v8df v2df
#  define v8di v2di
#  define v16sf v4sf
#  define v16si v4si
# elif VEC_MAX == 32
#  define v8df v4df
#  define v8di v4di
#  define v16sf v8sf
#  define v16si v8si
# endif
# if IDX_SIZE == 4
#  if INT_SIZE == 4
#   define gather(reg, mem, idx, msk, scl) BG(v16si, si, reg, mem, idx, msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v16si, s, mem, idx, reg, msk, scl)
#  elif INT_SIZE == 8
#   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, si, (vdi_t)(reg), mem, idx, msk, scl))
#   define scatter(mem, idx, reg, msk, scl) BS(v8di, s, mem, idx, (vdi_t)(reg), msk, scl)
#  elif FLOAT_SIZE == 4
#   define gather(reg, mem, idx, msk, scl) BG(v16sf, si, reg, mem, idx, msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, s, mem, idx, reg, msk, scl)
#  elif FLOAT_SIZE == 8
#   define gather(reg, mem, idx, msk, scl) BG(v8df, si, reg, mem, idx, msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v8df, s, mem, idx, reg, msk, scl)
#  endif
# elif IDX_SIZE == 8
#  if INT_SIZE == 4
#   define gather(reg, mem, idx, msk, scl) BG(v16si, di, reg, mem, (idi_t)(idx), msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v16si, d, mem, (idi_t)(idx), reg, msk, scl)
#  elif INT_SIZE == 8
#   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, di, (vdi_t)(reg), mem, (idi_t)(idx), msk, scl))
#   define scatter(mem, idx, reg, msk, scl) BS(v8di, d, mem, (idi_t)(idx), (vdi_t)(reg), msk, scl)
#  elif FLOAT_SIZE == 4
#   define gather(reg, mem, idx, msk, scl) BG(v16sf, di, reg, mem, (idi_t)(idx), msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, d, mem, (idi_t)(idx), reg, msk, scl)
#  elif FLOAT_SIZE == 8
#   define gather(reg, mem, idx, msk, scl) BG(v8df, di, reg, mem, (idi_t)(idx), msk, scl)
#   define scatter(mem, idx, reg, msk, scl) BS(v8df, d, mem, (idi_t)(idx), reg, msk, scl)
#  endif
# endif
#elif defined(__AVX2__)
/* AVX2 provides gathers only; no scatter() macro is defined on this path. */
# if VEC_SIZE == 16
#  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
# else
#  define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
# endif

# if VEC_MAX == 16
#  if IDX_SIZE == 4
#   if INT_SIZE == 4
#    define gather __builtin_ia32_gathersiv4si
#   elif INT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            (vec_t)(__builtin_ia32_gathersiv2di((vdi_t)(reg), \
                                                (const void *)(mem), \
                                                idx, (vdi_t)(msk), scl))
#   elif FLOAT_SIZE == 4
#    define gather __builtin_ia32_gathersiv4sf
#   elif FLOAT_SIZE == 8
#    define gather __builtin_ia32_gathersiv2df
#   endif
#  elif IDX_SIZE == 8
#   if INT_SIZE == 4
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4si(reg, mem, (vdi_t)(idx), msk, scl)
#   elif INT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            (vec_t)(__builtin_ia32_gatherdiv2di((vdi_t)(reg), \
                                                (const void *)(mem), \
                                                (vdi_t)(idx), (vdi_t)(msk), \
                                                scl))
#   elif FLOAT_SIZE == 4
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4sf(reg, mem, (vdi_t)(idx), msk, scl)
#   elif FLOAT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv2df(reg, mem, (vdi_t)(idx), msk, scl)
#   endif
#  endif
# elif VEC_MAX == 32
#  if IDX_SIZE == 4
#   if INT_SIZE == 4
#    define gather __builtin_ia32_gathersiv8si
#   elif INT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            (vec_t)(__builtin_ia32_gathersiv4di((vdi_t)(reg), \
                                                (const void *)(mem), \
                                                idx, (vdi_t)(msk), scl))

#   elif FLOAT_SIZE == 4
#    define gather __builtin_ia32_gathersiv8sf
#   elif FLOAT_SIZE == 8
#    define gather __builtin_ia32_gathersiv4df
#   endif
#  elif IDX_SIZE == 8
#   if INT_SIZE == 4
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4si256(reg, mem, (idi_t)(idx), msk, scl)
#   elif INT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            (vec_t)(__builtin_ia32_gatherdiv4di((vdi_t)(reg), \
                                                (const void *)(mem), \
                                                (vdi_t)(idx), (vdi_t)(msk), \
                                                scl))

#   elif FLOAT_SIZE == 4
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4sf256(reg, mem, (idi_t)(idx), msk, scl)
#   elif FLOAT_SIZE == 8
#    define gather(reg, mem, idx, msk, scl) \
            __builtin_ia32_gatherdiv4df(reg, mem, (vdi_t)(idx), msk, scl)
#   endif
#  endif
# endif
#endif
191
/*
 * Fallback whole-vector equality test, used when no eq() has been defined
 * yet: compare lane-wise and reduce the result through to_bool().
 */
#ifndef eq
# define eq(x, y) to_bool((x) == (y))
#endif

/* Two-level token pasting, so that macro arguments get expanded first. */
#define GLUE_(x, y) x ## y
#define GLUE(x, y) GLUE_(x, y)

/*
 * PUT<k>(n) expands to the k comma-separated consecutive values
 * n, n + 1, ..., n + k - 1, for use inside an initializer list.
 */
#define PUT2(n) (n), (n) + 1
#define PUT4(n) PUT2(n), PUT2((n) + 2)
#define PUT8(n) PUT4(n), PUT4((n) + 4)
#define PUT16(n) PUT8(n), PUT8((n) + 8)
#define PUT32(n) PUT16(n), PUT16((n) + 16)
#define PUT64(n) PUT32(n), PUT32((n) + 32)
205
206 const typeof((vec_t){}[0]) array[] = {
207 GLUE(PUT, VEC_MAX)(1),
208 GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
209 };
210
211 typeof((vec_t){}[0]) out[VEC_MAX * 2];
212
sg_test(void)213 int sg_test(void)
214 {
215 unsigned int i;
216 vec_t x, y, full = (vec_t){} == 0;
217 idx_t idx, inv;
218
219 for ( i = 0; i < IVEC_SIZE / IDX_SIZE; ++i )
220 {
221 idx[i] = i + 1;
222 inv[i] = ITEM_COUNT - i;
223 }
224
225 touch(idx);
226 touch(inv);
227
228 x = gather(full, array, (idx_t){}, full, 1);
229 for ( i = 0; i < ITEM_COUNT; ++i )
230 if ( x[i] != 1 )
231 return __LINE__;
232 for ( ; i < ELEM_COUNT; ++i )
233 if ( x[i] )
234 return __LINE__;
235
236 x = gather(full, array, idx, full, ELEM_SIZE);
237 for ( i = 0; i < ITEM_COUNT; ++i )
238 if ( x[i] != i + 2 )
239 return __LINE__;
240 for ( ; i < ELEM_COUNT; ++i )
241 if ( x[i] )
242 return __LINE__;
243
244 x = gather(full, array, idx * ELEM_SIZE, full, 2);
245 for ( i = 0; i < ITEM_COUNT; ++i )
246 if ( x[i] != i * 2 + 3 )
247 return __LINE__;
248 for ( ; i < ELEM_COUNT; ++i )
249 if ( x[i] )
250 return __LINE__;
251
252 x = gather(full, array, inv, full, ELEM_SIZE);
253 for ( i = 0; i < ITEM_COUNT; ++i )
254 if ( x[i] != inv[i] + 1 )
255 return __LINE__;
256 for ( ; i < ELEM_COUNT; ++i )
257 if ( x[i] )
258 return __LINE__;
259
260 y = gather(full, array + ITEM_COUNT, -idx, full, ELEM_SIZE);
261 #if ITEM_COUNT == ELEM_COUNT
262 if ( !eq(y, x - 1) )
263 return __LINE__;
264 #else
265 for ( i = 0; i < ITEM_COUNT; ++i )
266 if ( y[i] != x[i] - 1 )
267 return __LINE__;
268 for ( ; i < ELEM_COUNT; ++i )
269 if ( y[i] )
270 return __LINE__;
271 #endif
272
273 #if ELEM_SIZE > 1
274 x = gather(full, array, inv * 2, full, ELEM_SIZE / 2);
275 for ( i = 0; i < ITEM_COUNT; ++i )
276 if ( x[i] != inv[i] + 1 )
277 return __LINE__;
278 for ( ; i < ELEM_COUNT; ++i )
279 if ( x[i] )
280 return __LINE__;
281
282 # if ELEM_SIZE == IDX_SIZE
283 y = gather(x, array, idx, (idx & inv) != 0, ELEM_SIZE);
284 for ( i = 0; i < ITEM_COUNT; ++i )
285 if ( y[i] != ((i + 1) & (ITEM_COUNT - i) ? idx : inv)[i] + 1 )
286 return __LINE__;
287 for ( ; i < ELEM_COUNT; ++i )
288 if ( y[i] )
289 return __LINE__;
290 # endif
291 #endif
292
293 #ifdef scatter
294
295 for ( i = 0; i < sizeof(out) / sizeof(*out); ++i )
296 out[i] = 0;
297
298 for ( i = 0; i < ITEM_COUNT; ++i )
299 x[i] = i + 1;
300
301 touch(x);
302
303 scatter(out, (idx_t){}, x, (vec_t){ 1 } != 0, 1);
304 if ( out[0] != 1 )
305 return __LINE__;
306 for ( i = 1; i < ITEM_COUNT; ++i )
307 if ( out[i] )
308 return __LINE__;
309
310 scatter(out, (idx_t){}, x, full, 1);
311 if ( out[0] != ITEM_COUNT )
312 return __LINE__;
313 for ( i = 1; i < ITEM_COUNT; ++i )
314 if ( out[i] )
315 return __LINE__;
316
317 scatter(out, idx, x, full, ELEM_SIZE);
318 for ( i = 1; i <= ITEM_COUNT; ++i )
319 if ( out[i] != i )
320 return __LINE__;
321
322 scatter(out, inv, x, full, ELEM_SIZE);
323 for ( i = 1; i <= ITEM_COUNT; ++i )
324 if ( out[i] != ITEM_COUNT + 1 - i )
325 return __LINE__;
326
327 #endif
328
329 return 0;
330 }
331