1 #define UINT_SIZE 8
2 
3 #include "simd.h"
4 ENTRY(clmul_test);
5 
/*
 * Comparison and lane-shift helpers.
 *
 * eq(x, y)          - true when all elements of x and y compare equal.
 * lane_shr_unit(x)  - intended to shift each 128-bit lane right by 64 bits
 *                     (high quadword moves to the low one, high is zeroed).
 *
 * B() comes from simd.h and is not visible here.
 * NOTE(review): presumably it builds the size-specific builtin name from
 * its first two arguments - confirm there.
 */
#ifdef __AVX512F__ /* AVX512BW may get enabled only below */
/* Mask with one set bit for each of the ELEM_COUNT vector elements. */
# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
/* Masked quadword compare; every element's mask bit must be set. */
# define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
/*
 * Masked align of x with itself, merging into a zero vector. The byte mask
 * keeps only the low 8 bytes of each 16-byte lane and is truncated to
 * VEC_SIZE bits.
 */
# define lane_shr_unit(x) \
    ((vec_t)B(palignr, _mask, (vdi_t)(x), (vdi_t)(x), 64, (vdi_t){}, \
              0x00ff00ff00ff00ffULL & (~0ULL >> (64 - VEC_SIZE))))
#else
# if defined(__AVX2__) && VEC_SIZE == 32
/* "(vdi_t){} == 0" is an all-ones vector; ptestc checks cmp covers it. */
#  define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
# else
/* All 16 byte sign bits need to be set in the comparison result. */
#  define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
# endif
# define eq(x, y) to_bool((x) == (y))
/* Align x against a zero vector, so zeroes get shifted in from the top. */
# define lane_shr_unit(x) ((vec_t)B(palignr, , (vdi_t){}, (vdi_t)(x), 64))
#endif
21 
/*
 * Invoke the carry-less multiply builtin "op" on x and y (both cast to
 * vdi_t) with selector immediate c, yielding a vec_t. The expansion is
 * fully parenthesized so it composes safely with postfix operators.
 */
#define CLMUL(op, x, y, c) \
    ((vec_t)(__builtin_ia32_ ## op((vdi_t)(x), (vdi_t)(y), c)))

/*
 * Pick the builtins matching the vector size in use:
 *  clmul()  - carry-less multiply,
 *  vpshrd   - double-width right shift by an immediate count (used by
 *             lane_shr_i() when AVX512VBMI2 is available).
 */
#if VEC_SIZE == 16
# define clmul(x, y, c) CLMUL(pclmulqdq128, x, y, c)
# define vpshrd __builtin_ia32_vpshrd_v2di
#elif VEC_SIZE == 32
# define clmul(x, y, c) CLMUL(vpclmulqdq_v4di, x, y, c)
# define vpshrd __builtin_ia32_vpshrd_v4di
#elif VEC_SIZE == 64
# define clmul(x, y, c) CLMUL(vpclmulqdq_v8di, x, y, c)
# define vpshrd __builtin_ia32_vpshrd_v8di
#endif

/*
 * Quadword selectors: the first letter names the half (l = low, h = high)
 * taken from x (immediate bit 0), the second the half taken from y
 * (immediate bit 4).
 */
#define clmul_ll(x, y) clmul(x, y, 0x00)
#define clmul_hl(x, y) clmul(x, y, 0x01)
#define clmul_lh(x, y) clmul(x, y, 0x10)
#define clmul_hh(x, y) clmul(x, y, 0x11)
39 
/*
 * Right shift of x by n bits, built on lane_shr_unit() so that bits of the
 * upper quadword of a lane move into the lower one.
 *
 *  lane_shr_i(x, n) - n is passed straight to vpshrd in the AVX512VBMI2
 *                     build, i.e. it is expected to be a constant there.
 *  lane_shr_v(x, n) - n may be a run-time value.
 *
 * Both are statement expressions evaluating to a vec_t. Their arguments are
 * expanded more than once and hence must be free of side effects.
 */
#if defined(__AVX512VBMI2__)
# pragma GCC target ( "avx512bw" )
# define lane_shr_i(x, n) ({ \
    vec_t h_ = lane_shr_unit(x); \
    touch(h_); \
    (n) < 64 ? (vec_t)vpshrd((vdi_t)(x), (vdi_t)(h_), n) : h_ >> ((n) - 64); \
})
/* The count is put in memory and broadcast to all ELEM_COUNT elements. */
# define lane_shr_v(x, n) ({ \
    vec_t t_ = (x), h_ = lane_shr_unit(x); \
    typeof(t_[0]) n_ = (n); \
    if ( (n) < 64 ) \
        /* gcc does not support embedded broadcast */ \
        asm ( "vpshrdvq %2%{1to%c3%}, %1, %0" \
              : "+v" (t_) : "v" (h_), "m" (n_), "i" (ELEM_COUNT) ); \
    else \
        t_ = h_ >> ((n) - 64); \
    t_; \
})
#else
# define lane_shr_i lane_shr_v
/*
 * Generic form: for 0 < n < 64 combine the shifted value with the bits
 * coming in from lane_shr_unit(x); "-(n) & 0x3f" is the matching left shift
 * count (64 - n). For n == 0 t_ is x itself, leaving the value unchanged.
 */
# define lane_shr_v(x, n) ({ \
    vec_t t_ = (n) > 0 ? lane_shr_unit(x) : (x); \
    (n) < 64 ? ((x) >> (n)) | (t_ << (-(n) & 0x3f)) \
             : t_ >> ((n) - 64); \
})
#endif
66 
clmul_test(void)67 int clmul_test(void)
68 {
69     unsigned int i;
70     vec_t src;
71     vqi_t raw = {};
72 
73     for ( i = 1; i < VEC_SIZE; ++i )
74         raw[i] = i;
75     src = (vec_t)raw;
76 
77     for ( i = 0; i < 256; i += VEC_SIZE )
78     {
79         vec_t x = {}, y, z, lo, hi;
80         unsigned int j;
81 
82         touch(x);
83         y = clmul_ll(src, x);
84         touch(x);
85 
86         if ( !eq(y, x) ) return __LINE__;
87 
88         for ( j = 0; j < ELEM_COUNT; j += 2 )
89             x[j] = 1;
90 
91         touch(src);
92         y = clmul_ll(x, src);
93         touch(src);
94         z = clmul_lh(x, src);
95         touch(src);
96 
97         for ( j = 0; j < ELEM_COUNT; j += 2 )
98             y[j + 1] = z[j];
99 
100         if ( !eq(y, src) ) return __LINE__;
101 
102         /*
103          * Besides the obvious property of the low and high half products
104          * being the same either direction, the "square" of a number has the
105          * property of simply being the original bit pattern with a zero bit
106          * inserted between any two bits. This is what the code below checks.
107          */
108 
109         x = src;
110         touch(src);
111         y = clmul_lh(x, src);
112         touch(src);
113         z = clmul_hl(x, src);
114 
115         if ( !eq(y, z) ) return __LINE__;
116 
117         touch(src);
118         y = lo = clmul_ll(x, src);
119         touch(src);
120         z = hi = clmul_hh(x, src);
121         touch(src);
122 
123         for ( j = 0; j < 64; ++j )
124         {
125             vec_t l = lane_shr_v(lo, 2 * j);
126             vec_t h = lane_shr_v(hi, 2 * j);
127             unsigned int n;
128 
129             if ( !eq(l, y) ) return __LINE__;
130             if ( !eq(h, z) ) return __LINE__;
131 
132             x = src >> j;
133 
134             for ( n = 0; n < ELEM_COUNT; n += 2 )
135             {
136                 if ( (x[n + 0] & 1) != (l[n] & 3) ) return __LINE__;
137                 if ( (x[n + 1] & 1) != (h[n] & 3) ) return __LINE__;
138             }
139 
140             touch(y);
141             y = lane_shr_i(y, 2);
142             touch(z);
143             z = lane_shr_i(z, 2);
144         }
145 
146         src += 0x0101010101010101ULL * VEC_SIZE;
147     }
148 
149     return 0;
150 }
151