/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * Copyright (c) 2020 Linaro Limited
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 */

#include <arm64_macros.S>
#include <asm.S>
#define CPU_LE(x...)	x

/*
 * If the lower half of CTR is initialized with zeroes or a low value, we
 * can expect that the upper half will remain unchanged. As an
 * optimization, make the code that increments the upper half optional.
 */
#define INC_HALF_CTR	0

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.arch		armv8-a+crypto

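	//
	// Core 64x64 -> 128 bit carry-less multiply, using the single
	// PMULL/PMULL2 instruction provided by the Crypto Extensions
	// (p64 code path).
	//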
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

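	//
	// Fallback 64x64 -> 128 bit carry-less multiply for CPUs without the
	// 64-bit PMULL instruction: the product is built from 8-bit
	// polynomial multiplies (plain AdvSIMD PMULL) on byte-rotated copies
	// of the operands, which are recombined in __pmull_p8_tail below.
	//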
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

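	//
	// Prepare the p64 code path: load the precomputed higher powers of H
	// (stored from offset 16 of the key structure) used by the 4-way
	// aggregated loop, and fold each power into the (lo ^ hi) form
	// consumed by the Karatsuba (a1 + a0)(b1 + b0) multiplies.
	//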
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

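	// MASK holds the constant used by __pmull_reduce_p64 to fold the
	// 256-bit product back modulo the GHASH field polynomial
	// x^128 + x^7 + x^2 + x + 1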
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_imm		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
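	// (the multiplication by the reduction constant is carried out with
	// shifts and XORs instead)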
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

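	//
	// Common GHASH update loop, instantiated below for both the p64 and
	// p8 code paths.  Register usage follows the C prototypes of
	// pmull_ghash_update_p64/p8:
	//   w0: number of blocks, x1: dg[2], x2: src, x3: ghash_key,
	//   x4: optional head block (NULL if none)
	//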
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.16b}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

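	// Aggregated loop: consume four blocks per iteration, multiplying
	// them by the precomputed powers of H so that a single reduction
	// covers all four blocks.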
1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.16b}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

/*
 * void pmull_ghash_update_p64(int blocks, uint64_t dg[2], const uint8_t *src,
 *			       const struct internal_ghash_key *ghash_key,
 *			       const uint8_t *head);
 */
FUNC pmull_ghash_update_p64 , :
	__pmull_ghash	p64
END_FUNC pmull_ghash_update_p64

/*
 * void pmull_ghash_update_p8(int blocks, uint64_t dg[2], const uint8_t *src,
 *			      const struct internal_ghash_key *ghash_key,
 *			      const uint8_t *head);
 */
FUNC pmull_ghash_update_p8 , :
	__pmull_ghash	p8
END_FUNC pmull_ghash_update_p8

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

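	//
	// Load the AES round keys into v17-v31: AES-128 only uses v21-v31,
	// AES-192 additionally uses v19-v20 and AES-256 uses the full
	// v17-v31 range.  enc_round/enc_block below rely on this layout.
	//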
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

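	// one inner AES round: AESE (AddRoundKey + SubBytes + ShiftRows)
	// followed by AESMC (MixColumns)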
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

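	//
	// Combined AES-CTR keystream generation and GHASH update,
	// instantiated below for encryption and decryption.  Register usage
	// follows the C prototypes of pmull_gcm_encrypt/decrypt:
	//   w0: number of blocks, x1: dg[2], x2: dst, x3: src,
	//   x4: ghash_key, x5: ctr, x6: rk (round keys are loaded from it
	//   when non-NULL, otherwise they must already be in v17-v31, e.g.
	//   via pmull_gcm_load_round_keys), w7: rounds,
	//   [sp]: ks (encryption only)
	//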
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
#if INC_HALF_CTR
	ldr		x8, [x5, #8]			// load lower counter
#else
	ldp		x9, x8, [x5]			// load counter
#endif

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
#if !INC_HALF_CTR
CPU_LE(	rev		x9, x9		)
#endif
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

	cbnz		x6, 4f

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

#if INC_HALF_CTR
	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2
#endif

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	sub		w0, w0, #2

#if INC_HALF_CTR
	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11
#else
	ins		KS0.d[1], x8
	ins		KS0.d[0], x9
	rev64		KS0.16b, KS0.16b

	add		x8, x8, #1
	cbnz		x8, 10f
	add		x9, x9, #1
10:
	ins		KS1.d[1], x8
	ins		KS1.d[0], x9
	rev64		KS1.16b, KS1.16b

	add		x8, x8, #1
	cbnz		x8, 11f
	add		x9, x9, #1
11:
#endif

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
#if !INC_HALF_CTR
CPU_LE(	rev		x9, x9		)
#endif
	st1		{XL.2d}, [x1]
#if INC_HALF_CTR
	str		x8, [x5, #8]			// store lower counter
#else
	stp		x9, x8, [x5]			// store counter
#endif

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm

/*
 * void pmull_gcm_encrypt(int blocks, uint64_t dg[2], uint8_t dst[],
 *			  const uint8_t src[],
 *			  const struct internal_ghash_key *ghash_key,
 *			  uint64_t ctr[], const uint64_t rk[], int rounds,
 *			  uint8_t ks[]);
 */
FUNC pmull_gcm_encrypt , :
	pmull_gcm_do_crypt	1
END_FUNC pmull_gcm_encrypt

/*
 * void pmull_gcm_decrypt(int blocks, uint64_t dg[2], uint8_t dst[],
 *			  const uint8_t src[],
 *			  const struct internal_ghash_key *ghash_key,
 *			  uint64_t ctr[], const uint64_t rk[], int rounds);
 */
FUNC pmull_gcm_decrypt , :
	pmull_gcm_do_crypt	0
END_FUNC pmull_gcm_decrypt

/*
 * void pmull_gcm_encrypt_block(uint8_t dst[], const uint8_t src[], int rounds)
 */
FUNC pmull_gcm_encrypt_block , :
	ld1		{v0.16b}, [x1]
	enc_block	v0, w2
	st1		{v0.16b}, [x0]
	ret
END_FUNC pmull_gcm_encrypt_block

/*
 * void pmull_gcm_load_round_keys(const uint64_t rk[30], int rounds)
 */
FUNC pmull_gcm_load_round_keys , :
	load_round_keys	w1, x0
	ret
END_FUNC pmull_gcm_load_round_keys

/*
 * uint32_t pmull_gcm_aes_sub(uint32_t input)
 *
 * use the aese instruction to perform the AES sbox substitution
 * on each byte in 'input'
 */
FUNC pmull_gcm_aes_sub , :
	dup	v1.4s, w0
	movi	v0.16b, #0
	aese	v0.16b, v1.16b
	umov	w0, v0.s[0]
	ret
END_FUNC pmull_gcm_aes_sub

BTI(emit_aarch64_feature_1_and     GNU_PROPERTY_AARCH64_FEATURE_1_BTI)