#define UINT_SIZE 8

#include "simd.h"
ENTRY(clmul_test);

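/*
 * eq() compares two vectors for full equality.  lane_shr_unit() shifts each
 * 128-bit lane right by one 64-bit unit, i.e. it moves the high qword of
 * every lane into the low position and zero-fills the high one.
 */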
#ifdef __AVX512F__ /* AVX512BW may get enabled only below */
# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
# define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
# define lane_shr_unit(x) \
    ((vec_t)B(palignr, _mask, (vdi_t)(x), (vdi_t)(x), 64, (vdi_t){}, \
              0x00ff00ff00ff00ffULL & (~0ULL >> (64 - VEC_SIZE))))
#else
# if defined(__AVX2__) && VEC_SIZE == 32
#  define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
# else
#  define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
# endif
# define eq(x, y) to_bool((x) == (y))
# define lane_shr_unit(x) ((vec_t)B(palignr, , (vdi_t){}, (vdi_t)(x), 64))
#endif

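/*
 * CLMUL() expands to the GCC builtin doing a 64x64->128-bit carry-less
 * multiplication in every 128-bit lane; vpshrd aliases the VPSHRDQ builtin
 * matching the vector width in use.
 */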
#define CLMUL(op, x, y, c) (vec_t)(__builtin_ia32_ ## op((vdi_t)(x), (vdi_t)(y), c))

#if VEC_SIZE == 16
# define clmul(x, y, c) CLMUL(pclmulqdq128, x, y, c)
# define vpshrd __builtin_ia32_vpshrd_v2di
#elif VEC_SIZE == 32
# define clmul(x, y, c) CLMUL(vpclmulqdq_v4di, x, y, c)
# define vpshrd __builtin_ia32_vpshrd_v4di
#elif VEC_SIZE == 64
# define clmul(x, y, c) CLMUL(vpclmulqdq_v8di, x, y, c)
# define vpshrd __builtin_ia32_vpshrd_v8di
#endif

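/*
 * Immediate bit 0 selects the low (0) or high (1) source qword of the
 * first operand within each lane, bit 4 that of the second operand.
 */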
#define clmul_ll(x, y) clmul(x, y, 0x00)
#define clmul_hl(x, y) clmul(x, y, 0x01)
#define clmul_lh(x, y) clmul(x, y, 0x10)
#define clmul_hh(x, y) clmul(x, y, 0x11)

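/*
 * lane_shr_i(x, n) and lane_shr_v(x, n) shift every 128-bit lane of x right
 * by n bits (0 <= n < 128), with n an immediate or a variable respectively.
 * With AVX512VBMI2 the VPSHRD{,V}Q funnel shifts pull the bits crossing the
 * qword boundary in from lane_shr_unit(x); otherwise the result is composed
 * of plain per-qword shifts.
 */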
#if defined(__AVX512VBMI2__)
# pragma GCC target ( "avx512bw" )
# define lane_shr_i(x, n) ({ \
    vec_t h_ = lane_shr_unit(x); \
    touch(h_); \
    (n) < 64 ? (vec_t)vpshrd((vdi_t)(x), (vdi_t)(h_), n) : h_ >> ((n) - 64); \
})
# define lane_shr_v(x, n) ({ \
    vec_t t_ = (x), h_ = lane_shr_unit(x); \
    typeof(t_[0]) n_ = (n); \
    if ( (n) < 64 ) \
        /* gcc does not support embedded broadcast */ \
        asm ( "vpshrdvq %2%{1to%c3%}, %1, %0" \
              : "+v" (t_) : "v" (h_), "m" (n_), "i" (ELEM_COUNT) ); \
    else \
        t_ = h_ >> ((n) - 64); \
    t_; \
})
#else
# define lane_shr_i lane_shr_v
# define lane_shr_v(x, n) ({ \
    vec_t t_ = (n) > 0 ? lane_shr_unit(x) : (x); \
    (n) < 64 ? ((x) >> (n)) | (t_ << (-(n) & 0x3f)) \
             : t_ >> ((n) - 64); \
})
#endif

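/* Returns 0 on success, or the failing check's __LINE__ on failure. */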
int clmul_test(void)
{
    unsigned int i;
    vec_t src;
    vqi_t raw = {};

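    /*
     * Populate src with ascending byte values; each outer loop iteration
     * below advances every byte by VEC_SIZE.
     */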
    for ( i = 1; i < VEC_SIZE; ++i )
        raw[i] = i;
    src = (vec_t)raw;

    for ( i = 0; i < 256; i += VEC_SIZE )
    {
        vec_t x = {}, y, z, lo, hi;
        unsigned int j;

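        /* A carry-less product with a zero operand must be zero. */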
        touch(x);
        y = clmul_ll(src, x);
        touch(x);

        if ( !eq(y, x) ) return __LINE__;

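        /*
         * Set the low qword of every lane to 1: multiplying by one must
         * reproduce src, its low and high qwords coming from the _ll and
         * _lh products respectively.
         */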
        for ( j = 0; j < ELEM_COUNT; j += 2 )
            x[j] = 1;

        touch(src);
        y = clmul_ll(x, src);
        touch(src);
        z = clmul_lh(x, src);
        touch(src);

        for ( j = 0; j < ELEM_COUNT; j += 2 )
            y[j + 1] = z[j];

        if ( !eq(y, src) ) return __LINE__;

        /*
         * Besides the obvious property of the mixed (low by high) products
         * being the same either way round, the carry-less "square" of a
         * number is simply the original bit pattern with a zero bit
         * inserted between any two adjacent bits: all cross terms occur
         * twice and hence cancel modulo 2.  E.g. the square of 0b1011 is
         * 0b1000101.  This is what the code below checks.
         */

        x = src;
        touch(src);
        y = clmul_lh(x, src);
        touch(src);
        z = clmul_hl(x, src);

        if ( !eq(y, z) ) return __LINE__;

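        /* lo/hi are the carry-less squares of each lane's low/high qword. */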
        touch(src);
        y = lo = clmul_ll(x, src);
        touch(src);
        z = hi = clmul_hh(x, src);
        touch(src);

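        /*
         * Shifting the source right by one bit shifts its square right by
         * two.  Cross-check lane_shr_v() against the reference products y/z,
         * which get shifted step-wise by lane_shr_i() below.
         */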
        for ( j = 0; j < 64; ++j )
        {
            vec_t l = lane_shr_v(lo, 2 * j);
            vec_t h = lane_shr_v(hi, 2 * j);
            unsigned int n;

            if ( !eq(l, y) ) return __LINE__;
            if ( !eq(h, z) ) return __LINE__;

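            /*
             * Bit j of each source qword must appear at bit 2*j of its
             * square, with the inserted odd-numbered bit being zero.
             */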
            x = src >> j;

            for ( n = 0; n < ELEM_COUNT; n += 2 )
            {
                if ( (x[n + 0] & 1) != (l[n] & 3) ) return __LINE__;
                if ( (x[n + 1] & 1) != (h[n] & 3) ) return __LINE__;
            }

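            /* Advance the reference products by two bits for the next round. */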
            touch(y);
            y = lane_shr_i(y, 2);
            touch(z);
            z = lane_shr_i(z, 2);
        }

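        /* Advance every byte of src by VEC_SIZE (no byte ever carries). */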
        src += 0x0101010101010101ULL * VEC_SIZE;
    }

    return 0;
}