/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8
	.align		3

	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		enc_dround_4x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	enc_round	q3, \key2
	.endm

	.macro		dec_dround_4x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	dec_round	q3, \key2
	.endm

	.macro		enc_fround_4x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	aese.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

	.macro		dec_fround_4x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	aesd.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm

	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms.
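	 * They are reached via "bl" from the mode routines below, which use
	 * the prepare_key macro to preload the first two round keys into
	 * q8-q9 and the last one into q14 before making any calls. Note that
	 * do_block reloads the remaining round keys through q10-q13, so
	 * those registers are used as scratch space as well.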
	 * These should preserve all registers except q0 - q3 and ip.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_4x version only)
	 *   q2        : third in/output block (_4x version only)
	 *   q3        : fourth in/output block (_4x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)

	.align		6
aes_decrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)

	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
	.endm

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbencloop4x:
	subs		r4, r4, #4
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_encrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbencloop4x
.Lecbenc1x:
	adds		r4, r4, #4
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)

ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbdecloop4x:
	subs		r4, r4, #4
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_decrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbdecloop4x
.Lecbdec1x:
	adds		r4, r4, #4
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ...and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)

ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q15}, [r5]		@ keep iv in q15
	prepare_key	r2, r3
.Lcbcdecloop4x:
	subs		r4, r4, #4
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vmov		q4, q0
	vmov		q5, q1
	vmov		q6, q2
	vmov		q7, q3
	bl		aes_decrypt_4x
	veor		q0, q0, q15
	veor		q1, q1, q4
	veor		q2, q2, q5
	veor		q3, q3, q6
	vmov		q15, q7
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
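	@ q15 now carries the last ciphertext block, i.e. the chaining
	@ value for the next iteration (and the final IV written back below)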
	b		.Lcbcdecloop4x
.Lcbcdec1x:
	adds		r4, r4, #4
	beq		.Lcbcdecout
	vmov		q6, q14			@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q15, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q15}, [r5]		@ write back iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)


	/*
	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 */

ENTRY(ce_aes_cbc_cts_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q3}, [ip]

	vld1.8		{q1}, [r5]		@ get iv
	prepare_key	r2, r3

	veor		q0, q0, q1		@ xor with iv
	bl		aes_encrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbl.8		d2, {d6-d7}, d12
	vtbl.8		d3, {d6-d7}, d13

	veor		q0, q0, q1
	bl		aes_encrypt

	add		r4, r0, r4
	vst1.8		{q2}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_encrypt)

ENTRY(ce_aes_cbc_cts_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q1}, [ip]

	vld1.8		{q3}, [r5]		@ get iv
	prepare_key	r2, r3

	bl		aes_decrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbx.8		d0, {d2-d3}, d12
	vtbx.8		d1, {d2-d3}, d13

	veor		q1, q1, q2
	bl		aes_decrypt
	veor		q0, q0, q3		@ xor with iv

	add		r4, r0, r4
	vst1.8		{q1}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_decrypt)


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q7}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s31			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32-bit overflow?
	bcs		.Lctrloop
.Lctrloop4x:
	subs		r4, r4, #4
	bmi		.Lctr1x

	/*
	 * NOTE: the sequence below has been carefully tweaked to avoid
	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
	 * may produce an incorrect result if they take their input from a
	 * register of which a single 32-bit lane has been updated the last
	 * time it was modified. To work around this, the lanes of registers
	 * q0-q3 below are not manipulated individually, and the different
	 * counter values are prepared by successive manipulations of q7.
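	 *
	 * As an illustration (not a sequence that appears below): something
	 * like "vmov s3, ip" followed by "aese.8 q0, q8" would update a
	 * single 32-bit lane of q0 right before the AES instructions
	 * consume it, which is exactly the pattern the erratum describes,
	 * whereas copying the whole register first ("vmov q0, q7") is safe.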
	 */
	add		ip, r6, #1
	vmov		q0, q7
	rev		ip, ip
	add		lr, r6, #2
	vmov		s31, ip			@ set lane 3 of q1 via q7
	add		ip, r6, #3
	rev		lr, lr
	vmov		q1, q7
	vmov		s31, lr			@ set lane 3 of q2 via q7
	rev		ip, ip
	vmov		q2, q7
	vmov		s31, ip			@ set lane 3 of q3 via q7
	add		r6, r6, #4
	vmov		q3, q7

	vld1.8		{q4-q5}, [r1]!
	vld1.8		{q6}, [r1]!
	vld1.8		{q15}, [r1]!
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q15
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		s31, ip
	b		.Lctrloop4x
.Lctr1x:
	adds		r4, r4, #4
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q7
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s31, ip
	bcs		.Lctrcarry

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q7}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

.Lctrcarry:
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ...to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)

	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm

ce_aes_xts_init:
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)

ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc4x

.Lxtsencloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsenc4x:
	subs		r4, r4, #64
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsencret
	b		.Lxtsencloop4x
.Lxtsenc1x:
	adds		r4, r4, #64
	beq		.Lxtsencout
	subs		r4, r4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
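	@ .Lxtsencctsout is entered both by falling through from the load
	@ above and, for ciphertext stealing, from .Lxtsenccts below with
	@ the merged final block already in q0 and r4 cleared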
.Lxtsencctsout:
	veor		q0, q0, q4
	bl		aes_encrypt
	veor		q0, q0, q4
	teq		r4, #0
	beq		.Lxtsencout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	bmi		.Lxtsenccts
	vst1.8		{q0}, [r0]!
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q0}, [r0]
.Lxtsencret:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.LxtsencctsNx:
	vmov		q0, q3
	sub		r0, r0, #16
.Lxtsenccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsencctsout
ENDPROC(ce_aes_xts_encrypt)


ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	/* subtract 16 bytes if we are doing CTS */
	tst		r4, #0xf
	subne		r4, r4, #0x10

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec4x

.Lxtsdecloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsdec4x:
	subs		r4, r4, #64
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_decrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop4x
.Lxtsdec1x:
	adds		r4, r4, #64
	beq		.Lxtsdecout
	subs		r4, r4, #16
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	veor		q0, q0, q4
	bl		aes_decrypt
	veor		q0, q0, q4
	vst1.8		{q0}, [r0]!
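	@ any input left? (r4 holds the remaining byte count)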
	teq		r4, #0
	beq		.Lxtsdecout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.Lxtsdeccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	next_tweak	q5, q4, q15, q6

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	veor		q0, q0, q5
	bl		aes_decrypt
	veor		q0, q0, q5

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsdecctsout
ENDPROC(ce_aes_xts_decrypt)

	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *			       AES sbox substitution on each byte in
	 *			       'input'
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *					   operation on round key *src
	 */
ENTRY(ce_aes_invert)
	vld1.32		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.32		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)

	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
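
	/*
	 * How the table above is used (informal sketch): for a final
	 * partial block of n bytes (1 <= n <= 16), the CTS code loads 16
	 * table bytes from offset n, giving a vtbl/vtbx index vector that
	 * moves the first n bytes of a source register to its end, and 16
	 * bytes from offset 32 - n, giving one that moves the last n bytes
	 * to the start. The out-of-range 0xff indexes make vtbl.8 write a
	 * zero byte and make vtbx.8 leave the destination byte unchanged,
	 * which is what lets the overlapping loads and stores splice the
	 * last two blocks together without byte-by-byte copying.
	 */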