/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * Copyright (c) 2020 Linaro Limited
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 */

#include <arm64_macros.S>
#include <asm.S>
#define CPU_LE(x...)	x

/*
 * If the lower half of CTR is initialized with zeroes or a low value, we
 * can expect the upper half to remain unchanged. As an optimization, make
 * the code that increments the upper half optional.
 */
#define INC_HALF_CTR	0

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm
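
	//
	// Note added for clarity: the p8 variants above emulate a
	// 64x64->128 bit carryless multiply on CPUs that only provide the
	// 8-bit polynomial multiply. Byte-rotated copies of the operands
	// (A1..A3 and B1..B4) yield the partial products D..K named in the
	// inline comments; these are xor-combined and moved into place by
	// the ext #15/#14/#13/#12 steps, which implement the << 8/16/24/32
	// placements noted above.
	//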

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_imm		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm
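
	//
	// Note added for clarity: both reduction variants fold the 256-bit
	// product held in XH:XL back into 128 bits modulo the GHASH
	// polynomial x^128 + x^7 + x^2 + x + 1, in the bit-reflected
	// representation GCM uses. The p64 path multiplies by the constant
	// 0xe1 << 57 (MASK) to do the folding in two pmull steps, while the
	// p8 path obtains the same effect from the shift-and-xor sequence
	// (shl #57/#62/#63, ushr #1/#6/#1) above.
	//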

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.16b}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.16b}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

/*
 * void pmull_ghash_update_p64(int blocks, uint64_t dg[2], const uint8_t *src,
 *			       const struct internal_ghash_key *ghash_key,
 *			       const uint8_t *head);
 */
FUNC pmull_ghash_update_p64 , :
	__pmull_ghash	p64
END_FUNC pmull_ghash_update_p64

/*
 * void pmull_ghash_update_p8(int blocks, uint64_t dg[2], const uint8_t *src,
 *			      const struct internal_ghash_key *ghash_key,
 *			      const uint8_t *head);
 */
FUNC pmull_ghash_update_p8 , :
	__pmull_ghash	p8
END_FUNC pmull_ghash_update_p8
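
/*
 * Note added for clarity, assuming the standard AArch64 procedure call
 * convention: for the two update entry points above, w0 holds the block
 * count, x1 points to the running GHASH state dg[2], x2 to the source data,
 * x3 to the hash key (H in the first 16 bytes, with the extra powers of H
 * used by the 4-block p64 loop following it), and x4 to an optional head
 * block that is processed first when non-NULL.
 */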

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
#if INC_HALF_CTR
	ldr		x8, [x5, #8]			// load lower counter
#else
	ldp		x9, x8, [x5]			// load counter
#endif

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
#if !INC_HALF_CTR
CPU_LE(	rev		x9, x9		)
#endif
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

	cbnz		x6, 4f

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

#if INC_HALF_CTR
	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2
#endif

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	sub		w0, w0, #2

#if INC_HALF_CTR
	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11
#else
	ins		KS0.d[1], x8
	ins		KS0.d[0], x9
	rev64		KS0.16b, KS0.16b

	add		x8, x8, #1
	cbnz		x8, 10f
	add		x9, x9, #1
10:
	ins		KS1.d[1], x8
	ins		KS1.d[0], x9
	rev64		KS1.16b, KS1.16b

	add		x8, x8, #1
	cbnz		x8, 11f
	add		x9, x9, #1
11:
#endif

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
#if !INC_HALF_CTR
CPU_LE(	rev		x9, x9		)
#endif
	st1		{XL.2d}, [x1]
#if INC_HALF_CTR
	str		x8, [x5, #8]			// store lower counter
#else
	stp		x9, x8, [x5]			// store counter
#endif

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm
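
/*
 * Illustrative call sequence for the two GCM entry points below (a sketch
 * added here, not taken from the C callers; the variable names are
 * hypothetical):
 *
 *	pmull_gcm_load_round_keys(rk, rounds);
 *	pmull_gcm_encrypt(blocks, dg, dst, src, ghash_key, ctr, NULL,
 *			  rounds, ks);
 *
 * Passing a non-NULL rk instead makes the routine load the round keys
 * itself (the cbnz x6 test in pmull_gcm_do_crypt). For the encrypt path
 * the ninth argument, ks[], arrives on the stack (ldr x10, [sp]). The main
 * loop consumes two blocks per iteration, so the block count is expected
 * to be a multiple of two.
 */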

/*
 * void pmull_gcm_encrypt(int blocks, uint64_t dg[2], uint8_t dst[],
 *			  const uint8_t src[],
 *			  const struct internal_ghash_key *ghash_key,
 *			  uint64_t ctr[], const uint64_t rk[], int rounds,
 *			  uint8_t ks[]);
 */
FUNC pmull_gcm_encrypt , :
	pmull_gcm_do_crypt	1
END_FUNC pmull_gcm_encrypt

/*
 * void pmull_gcm_decrypt(int blocks, uint64_t dg[2], uint8_t dst[],
 *			  const uint8_t src[],
 *			  const struct internal_ghash_key *ghash_key,
 *			  uint64_t ctr[], const uint64_t rk[], int rounds);
 */
FUNC pmull_gcm_decrypt , :
	pmull_gcm_do_crypt	0
END_FUNC pmull_gcm_decrypt

/*
 * void pmull_gcm_encrypt_block(uint8_t dst[], const uint8_t src[], int rounds)
 */
FUNC pmull_gcm_encrypt_block , :
	ld1		{v0.16b}, [x1]
	enc_block	v0, w2
	st1		{v0.16b}, [x0]
	ret
END_FUNC pmull_gcm_encrypt_block

/*
 * void pmull_gcm_load_round_keys(const uint64_t rk[30], int rounds)
 */
FUNC pmull_gcm_load_round_keys , :
	load_round_keys	w1, x0
	ret
END_FUNC pmull_gcm_load_round_keys

/*
 * uint32_t pmull_gcm_aes_sub(uint32_t input)
 *
 * use the aese instruction to perform the AES sbox substitution
 * on each byte in 'input'
 */
FUNC pmull_gcm_aes_sub , :
	dup		v1.4s, w0
	movi		v0.16b, #0
	aese		v0.16b, v1.16b
	umov		w0, v0.s[0]
	ret
END_FUNC pmull_gcm_aes_sub

BTI(emit_aarch64_feature_1_and GNU_PROPERTY_AARCH64_FEATURE_1_BTI)