1 #define INT_SIZE 4
2 
3 #include "simd.h"
4 ENTRY(sha_test);
5 
6 #define SHA(op, a...) __builtin_ia32_sha ## op(a)
7 
8 #ifdef __AVX512F__
9 # define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
10 # define eq(x, y) (B(pcmpeqd, _mask, x, y, -1) == ALL_TRUE)
11 # define blend(x, y, sel) B(movdqa32_, _mask, y, x, sel)
12 # define rot_c(f, r, x, n) B(pro ## f ## d, _mask, x, n, undef(), ~0)
13 # define rot_s(f, r, x, n) ({ /* gcc does not support embedded broadcast */ \
14     vec_t r_; \
15     asm ( "vpro" #f "vd %2%{1to%c3%}, %1, %0" \
16           : "=v" (r_) \
17           : "v" (x), "m" (n), "i" (ELEM_COUNT) ); \
18     r_; \
19 })
20 # define rot_v(d, x, n) B(pro ## d ## vd, _mask, x, n, undef(), ~0)
21 # define shift_s(d, x, n) ({ \
22     vec_t r_; \
23     asm ( "vps" #d "lvd %2%{1to%c3%}, %1, %0" \
24           : "=v" (r_) \
25           : "v" (x), "m" (n), "i" (ELEM_COUNT) ); \
26     r_; \
27 })
28 # define vshift(d, x, n) ({ /* gcc does not allow memory operands */ \
29     vec_t r_; \
30     asm ( "vps" #d "ldq %2, %1, %0" \
31           : "=v" (r_) : "m" (x), "i" ((n) * ELEM_SIZE) ); \
32     r_; \
33 })
34 #else
/* Flavors of the helpers for builds without AVX512F. */

/* True iff the byte mask extracted from cmp has all 16 bits set. */
# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
/* True iff x and y compare equal in every element. */
# define eq(x, y) to_bool((x) == (y))
/*
 * Per-element select via PBLENDW: each bit of sel (bits 0..3) is widened to
 * a pair of adjacent bits of the word-granular immediate.
 */
# define blend(x, y, sel) \
    ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), \
                                      ((sel) & 1 ? 0x03 : 0) | \
                                      ((sel) & 2 ? 0x0c : 0) | \
                                      ((sel) & 4 ? 0x30 : 0) | \
                                      ((sel) & 8 ? 0xc0 : 0)))
/*
 * Rotate by n, composed from a shift by n in direction f and a shift by
 * 32 - n in the reverse direction r (uses sh<f>_c() / sh<r>_c()).
 */
# define rot_c(f, r, x, n) (sh ## f ## _c(x, n) | sh ## r ## _c(x, 32 - (n)))
/*
 * Same composition as rot_c(), but with the shift counts n and 32 - n
 * supplied as memory operands to the shift instructions.
 */
# define rot_s(f, r, x, n) ({ /* gcc does not allow memory operands */ \
    vec_t r_, t_, n_ = (vec_t){ 32 } - (n); \
    asm ( "ps" #f "ld %2, %0; ps" #r "ld %3, %1; por %1, %0" \
          : "=&x" (r_), "=&x" (t_) \
          : "m" (n), "m" (n_), "0" (x), "1" (x) ); \
    r_; \
})
/*
 * Rotate x left by n bit positions; only the low five bits of n matter.
 *
 * Both shift counts are masked to 0...31, so no shift ever reaches the
 * operand width, including for n == 0 and n == 32 (x is returned as is).
 * The constants assume a 32-bit unsigned int.
 */
static inline unsigned int rotl(unsigned int x, unsigned int n)
{
    return (x << (n & 0x1f)) | (x >> ((32 - n) & 0x1f));
}
/*
 * Rotate x right by n bit positions; only the low five bits of n matter.
 *
 * Both shift counts are masked to 0...31, so no shift ever reaches the
 * operand width, including for n == 0 and n == 32 (x is returned as is).
 * The constants assume a 32-bit unsigned int.
 */
static inline unsigned int rotr(unsigned int x, unsigned int n)
{
    return (x >> (n & 0x1f)) | (x << ((32 - n) & 0x1f));
}
59 # define rot_v(d, x, n) ({ \
60     vec_t t_; \
61     unsigned int i_; \
62     for ( i_ = 0; i_ < ELEM_COUNT; ++i_ ) \
63         t_[i_] = rot ## d((x)[i_], (n)[i_]); \
64     t_; \
65 })
66 # define shift_s(d, x, n) ({ \
67     vec_t r_; \
68     asm ( "ps" #d "ld %1, %0" : "=&x" (r_) : "m" (n), "0" (x) ); \
69     r_; \
70 })
71 # define vshift(d, x, n) \
72     (vec_t)(__builtin_ia32_ps ## d ## ldqi128((vdi_t)(x), (n) * ELEM_SIZE * 8))
73 #endif
74 
/*
 * Wrappers shared by both flavors of the helpers above.
 *
 * Suffix convention, as used in this file:
 *   _c - count is a literal constant
 *   _s - count is supplied in a vec_t (e.g. { 17 })
 *   _v - individual count per element, supplied in a vec_t
 */

/* PALIGNR wrapper; n is multiplied by 8 before being handed to the builtin. */
#define alignr(x, y, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(y), (n) * 8))
#define hadd(x, y) __builtin_ia32_phaddd128(x, y)
/* Rotates: forward to rot_*() with the direction letter(s) filled in. */
#define rol_c(x, n) rot_c(l, r, x, n)
#define rol_s(x, n) rot_s(l, r, x, n)
#define rol_v(x, n...) rot_v(l, x, n)
#define ror_c(x, n) rot_c(r, l, x, n)
#define ror_s(x, n) rot_s(r, l, x, n)
#define ror_v(x, n...) rot_v(r, x, n)
/* Per-element logical shifts. */
#define shl_c(x, n) __builtin_ia32_pslldi128(x, n)
#define shl_s(x, n) shift_s(l, x, n)
#define shr_c(x, n) __builtin_ia32_psrldi128(x, n)
#define shr_s(x, n) shift_s(r, x, n)
/* Element shuffle; swap() reverses the order of the four elements. */
#define shuf(x, s) __builtin_ia32_pshufd(x, s)
#define swap(x) shuf(x, 0b00011011)
/* Whole-vector shifts by n elements. */
#define vshl(x, n) vshift(l, x, n)
#define vshr(x, n) vshift(r, x, n)

/*
 * Per-element SHA-256 "small sigma 0": ROR7(w) ^ ROR18(w) ^ SHR3(w), with
 * the rotate right by 18 expressed as a rotate left by 14.
 *
 * All counts are literal constants (the _c helper forms).
 * NOTE(review): touch() comes from simd.h; presumably it keeps the compiler
 * from folding the individual operations together - confirm.
 */
static inline vec_t sha256_sigma0(vec_t w)
{
    vec_t res;

    touch(w);
    res = ror_c(w, 7);
    touch(w);
    res ^= rol_c(w, 14);
    touch(w);
    res ^= shr_c(w, 3);
    touch(w);

    return res;
}

/*
 * Per-element SHA-256 "small sigma 1": ROR17(w) ^ ROR19(w) ^ SHR10(w).
 *
 * The counts are supplied in vectors (the _s helper forms), each holding
 * the count in its lowest element.
 */
static inline vec_t sha256_sigma1(vec_t w)
{
    vec_t _17 = { 17 }, _19 = { 19 }, _10 = { 10 };

    return ror_s(w, _17) ^ ror_s(w, _19) ^ shr_s(w, _10);
}

/*
 * SHA-256 "big sigma 0", ROR2 ^ ROR13 ^ ROR22 (the latter expressed as a
 * rotate left by 10), applied to the two high elements of w only.
 *
 * The per-element counts (the _v helper forms) are zero for the two low
 * elements; a rotate by zero is the identity and x ^ x ^ x == x, so those
 * two elements are passed through unchanged.
 */
static inline vec_t sha256_Sigma0(vec_t w)
{
    vec_t res, n1 = { 0, 0, 2, 2 }, n2 = { 0, 0, 13, 13 }, n3 = { 0, 0, 10, 10 };

    touch(n1);
    res = ror_v(w, n1);
    touch(n2);
    res ^= ror_v(w, n2);
    touch(n3);

    return res ^ rol_v(w, n3);
}

/*
 * Per-element SHA-256 "big sigma 1": ROR6(w) ^ ROR11(w) ^ ROR25(w), with
 * the rotate right by 25 expressed as a rotate left by 7.
 */
static inline vec_t sha256_Sigma1(vec_t w)
{
    return ror_c(w, 6) ^ ror_c(w, 11) ^ rol_c(w, 7);
}

sha_test(void)132 int sha_test(void)
133 {
134     unsigned int i;
135     vec_t src, one = { 1 };
136     vqi_t raw = {};
137 
138     for ( i = 1; i < VEC_SIZE; ++i )
139         raw[i] = i;
140     src = (vec_t)raw;
141 
142     for ( i = 0; i < 256; i += VEC_SIZE )
143     {
144         vec_t x, y, tmp, hash = -src;
145         vec_t a, b, c, d, e, g, h;
146         unsigned int k, r;
147 
148         touch(src);
149         x = SHA(1msg1, hash, src);
150         touch(src);
151         y = hash ^ alignr(hash, src, 8);
152         touch(src);
153 
154         if ( !eq(x, y) ) return __LINE__;
155 
156         touch(src);
157         x = SHA(1msg2, hash, src);
158         touch(src);
159         tmp = hash ^ alignr(src, hash, 12);
160         touch(tmp);
161         y = rol_c(tmp, 1);
162         tmp = hash ^ alignr(src, y, 12);
163         touch(tmp);
164         y = rol_c(tmp, 1);
165 
166         if ( !eq(x, y) ) return __LINE__;
167 
168         touch(src);
169         x = SHA(1msg2, hash, src);
170         touch(src);
171         tmp = rol_s(hash ^ alignr(src, hash, 12), one);
172         y = rol_s(hash ^ alignr(src, tmp, 12), one);
173 
174         if ( !eq(x, y) ) return __LINE__;
175 
176         touch(src);
177         x = SHA(1nexte, hash, src);
178         touch(src);
179         touch(hash);
180         tmp = rol_c(hash, 30);
181         tmp[2] = tmp[1] = tmp[0] = 0;
182 
183         if ( !eq(x, src + tmp) ) return __LINE__;
184 
185         /*
186          * SHA1RNDS4
187          *
188          * SRC1 = { A0, B0, C0, D0 }
189          * SRC2 = W' = { W[0]E0, W[1], W[2], W[3] }
190          *
191          * (NB that the notation is not C-like, i.e. elements are listed
192          * high-to-low everywhere in this comment.)
193          *
194          * In order to pick a simple rounds function, an immediate value of
195          * 1 is used; 3 would also be a possibility.
196          *
197          * Applying
198          *
199          * A1 = ROL5(A0) + (B0 ^ C0 ^ D0) + W'[0] + K
200          * E1 = D0
201          * D1 = C0
202          * C1 = ROL30(B0)
203          * B1 = A0
204          *
205          * iteratively four times and resolving round variable values to
206          * A<n> and B0, C0, and D0 we get
207          *
208          * A4 = ROL5(A3) + (A2 ^ ROL30(A1) ^ ROL30(A0)) + W'[3] + ROL30(B0) + K
209          * A3 = ROL5(A2) + (A1 ^ ROL30(A0) ^ ROL30(B0)) + W'[2] +       C0  + K
210          * A2 = ROL5(A1) + (A0 ^ ROL30(B0) ^       C0 ) + W'[1] +       D0  + K
211          * A1 = ROL5(A0) + (B0 ^       C0  ^       D0 ) + W'[0]             + K
212          *
213          * (respective per-column variable names:
214          *  y         a      b          c           d      src           e    k
215          * )
216          *
217          * with
218          *
219          * B4 = A3
220          * C4 = ROL30(A2)
221          * D4 = ROL30(A1)
222          * E4 = ROL30(A0)
223          *
224          * and hence
225          *
226          * DST = { A4, A3, ROL30(A2), ROL30(A1) }
227          */
228 
229         touch(src);
230         x = SHA(1rnds4, hash, src, 1);
231         touch(src);
232 
233         a = vshr(hash, 3);
234         b = vshr(hash, 2);
235         touch(hash);
236         d = rol_c(hash, 30);
237         touch(hash);
238         d = blend(d, hash, 0b0011);
239         c = vshr(d, 1);
240         e = vshl(d, 1);
241         tmp = (vec_t){};
242         k = rol_c(SHA(1rnds4, tmp, tmp, 1), 2)[0];
243 
244         for ( r = 0; r < 4; ++r )
245         {
246             y = rol_c(a, 5) + (b ^ c ^ d) + swap(src) + e + k;
247 
248             switch ( r )
249             {
250             case 0:
251                 c[3] = rol_c(y, 30)[0];
252                 /* fall through */
253             case 1:
254                 b[r + 2] = y[r];
255                 /* fall through */
256             case 2:
257                 a[r + 1] = y[r];
258                 break;
259             }
260 
261             switch ( r )
262             {
263             case 3:
264                 if ( a[3] != y[2] ) return __LINE__;
265                 /* fall through */
266             case 2:
267                 if ( a[2] != y[1] ) return __LINE__;
268                 if ( b[3] != y[1] ) return __LINE__;
269                 /* fall through */
270             case 1:
271                 if ( a[1] != y[0] ) return __LINE__;
272                 if ( b[2] != y[0] ) return __LINE__;
273                 if ( c[3] != rol_c(y, 30)[0] ) return __LINE__;
274                 break;
275             }
276         }
277 
278         a = blend(rol_c(y, 30), y, 0b1100);
279 
280         if ( !eq(x, a) ) return __LINE__;
281 
282         touch(src);
283         x = SHA(256msg1, hash, src);
284         touch(src);
285         y = hash + sha256_sigma0(alignr(src, hash, 4));
286 
287         if ( !eq(x, y) ) return __LINE__;
288 
289         touch(src);
290         x = SHA(256msg2, hash, src);
291         touch(src);
292         tmp = hash + sha256_sigma1(alignr(hash, src, 8));
293         y = hash + sha256_sigma1(alignr(tmp, src, 8));
294 
295         if ( !eq(x, y) ) return __LINE__;
296 
297         /*
298          * SHA256RNDS2
299          *
300          * SRC1 = { C0, D0, G0, H0 }
301          * SRC2 = { A0, B0, E0, F0 }
302          * XMM0 = W' = { ?, ?, WK1, WK0 }
303          *
304          * (NB that the notation again is not C-like, i.e. elements are listed
305          * high-to-low everywhere in this comment.)
306          *
307          * Ch(E,F,G) = (E & F) ^ (~E & G)
308          * Maj(A,B,C) = (A & B) ^ (A & C) ^ (B & C)
309          *
310          * Σ0(A) = ROR2(A) ^ ROR13(A) ^ ROR22(A)
311          * Σ1(E) = ROR6(E) ^ ROR11(E) ^ ROR25(E)
312          *
313          * Applying
314          *
315          * A1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + Maj(A0, B0, C0) + Σ0(A0)
316          * B1 = A0
317          * C1 = B0
318          * D1 = C0
319          * E1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + D0
320          * F1 = E0
321          * G1 = F0
322          * H1 = G0
323          *
324          * iteratively four times and resolving round variable values to
325          * A<n> / E<n> and B0, C0, D0, F0, G0, and H0 we get
326          *
327          * A2 = Ch(E1, E0, F0) + Σ1(E1) + WK1 + G0 + Maj(A1, A0, B0) + Σ0(A1)
328          * A1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + Maj(A0, B0, C0) + Σ0(A0)
329          * E2 = Ch(E1, E0, F0) + Σ1(E1) + WK1 + G0 + C0
330          * E1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + D0
331          *
332          * with
333          *
334          * B2 = A1
335          * F2 = E1
336          *
337          * and hence
338          *
339          * DST = { A2, A1, E2, E1 }
340          *
341          * which we can simplify a little, by letting A0, B0, and E0 be zero
342          * and F0 = ~G0, and by then utilizing
343          *
344          * Ch(0, 0, x) = x
345          * Ch(x, 0, y) = ~x & y
346          * Maj(x, 0, 0) = Maj(0, x, 0) = Maj(0, 0, x) = 0
347          *
348          * A2 = (~E1 & F0) + Σ1(E1) + WK1 + G0 + Σ0(A1)
349          * A1 = (~E0 & G0) + Σ1(E0) + WK0 + H0 + Σ0(A0)
350          * E2 = (~E1 & F0) + Σ1(E1) + WK1 + G0 + C0
351          * E1 = (~E0 & G0) + Σ1(E0) + WK0 + H0 + D0
352          *
353          * (respective per-column variable names:
354          *  y      e    g        e    src    h    d
355          * )
356          */
357 
358         tmp = (vec_t){ ~hash[1] };
359         touch(tmp);
360         x = SHA(256rnds2, hash, tmp, src);
361         touch(tmp);
362 
363         e = y = (vec_t){};
364         d = alignr(y, hash, 8);
365         g = (vec_t){ hash[1], tmp[0], hash[1], tmp[0] };
366         h = shuf(hash, 0b01000100);
367 
368         for ( r = 0; r < 2; ++r )
369         {
370             y = (~e & g) + sha256_Sigma1(e) + shuf(src, 0b01000100) +
371                 h + sha256_Sigma0(d);
372 
373             if ( !r )
374             {
375                 d[3] = y[2];
376                 e[3] = e[1] = y[0];
377             }
378             else if ( d[3] != y[2] )
379                 return __LINE__;
380             else if ( e[1] != y[0] )
381                 return __LINE__;
382             else if ( e[3] != y[0] )
383                 return __LINE__;
384         }
385 
386         if ( !eq(x, y) ) return __LINE__;
387 
388         src += 0x01010101 * VEC_SIZE;
389     }
390 
391     return 0;
392 }
393