1/* Optimized memcmp implementation for POWER7/PowerPC64. 2 Copyright (C) 2010-2021 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <sysdep.h> 20 21/* int [r3] memcmp (const char *s1 [r3], 22 const char *s2 [r4], 23 size_t size [r5]) Returns 0 in r3 when the compared bytes are equal. On a mismatch the doubleword paths return 1 when the s1 data compares greater (unsigned) and -1 otherwise; the byte loop may instead return the difference of the two mismatching bytes (s1 byte minus s2 byte). */ 24#ifndef MEMCMP 25# define MEMCMP memcmp 26#endif 27 .machine power7 28ENTRY_TOCLESS (MEMCMP, 4) 29 CALL_MCOUNT 3 30 31#define rRTN r3 32#define rSTR1 r3 /* first string arg */ 33#define rSTR2 r4 /* second string arg */ 34#define rN r5 /* max string length */ 35#define rWORD1 r6 /* current word in s1 */ 36#define rWORD2 r7 /* current word in s2 */ 37#define rWORD3 r8 /* next word in s1 */ 38#define rWORD4 r9 /* next word in s2 */ 39#define rWORD5 r10 /* next word in s1 */ 40#define rWORD6 r11 /* next word in s2 */ 41 42#define rOFF8 r20 /* 8 bytes offset. */ 43#define rOFF16 r21 /* 16 bytes offset. */ 44#define rOFF24 r22 /* 24 bytes offset. */ 45#define rOFF32 r23 /* 32 bytes offset. */ 46#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ 47#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ 48#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ 49#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ 50#define rSHR r28 /* Unaligned shift right count.
*/ 51#define rSHL r29 /* Unaligned shift left count. */ 52#define rWORD7 r30 /* next word in s1 */ 53#define rWORD8 r31 /* next word in s2 */ 54 /* Save-slot offsets, relative to r1, for the registers r20-r31 named above. The offsets are negative and r1 itself is never adjusted in this routine, so the slots live below the stack pointer. */ 55#define rWORD8SAVE (-8) 56#define rWORD7SAVE (-16) 57#define rOFF8SAVE (-24) 58#define rOFF16SAVE (-32) 59#define rOFF24SAVE (-40) 60#define rOFF32SAVE (-48) 61#define rSHRSAVE (-56) 62#define rSHLSAVE (-64) 63#define rWORD8SHIFTSAVE (-72) 64#define rWORD2SHIFTSAVE (-80) 65#define rWORD4SHIFTSAVE (-88) 66#define rWORD6SHIFTSAVE (-96) 67 /* LD loads one doubleword with indexed addressing. Little-endian builds use the byte-reversed form (ldbrx) so that the unsigned cmpld compares below see the bytes in memory order. */ 68#ifdef __LITTLE_ENDIAN__ 69# define LD ldbrx 70#else 71# define LD ldx 72#endif 73 74 xor r0, rSTR2, rSTR1 75 cmpldi cr6, rN, 0 76 cmpldi cr1, rN, 12 77 clrldi. r0, r0, 61 78 clrldi r12, rSTR1, 61 79 cmpldi cr5, r12, 0 80 beq- cr6, L(zeroLength) 81 dcbt 0, rSTR1 82 dcbt 0, rSTR2 83/* If the length is less than 12 bytes (cr1), use the byte loop 84 regardless of alignment. */ 85 blt cr1, L(bytealigned) /* Save the registers used as rWORD7/rWORD8 and rOFF8..rOFF32 before the doubleword paths overwrite them. */ 86 std rWORD8, rWORD8SAVE(r1) 87 std rWORD7, rWORD7SAVE(r1) 88 std rOFF8, rOFF8SAVE(r1) 89 std rOFF16, rOFF16SAVE(r1) 90 std rOFF24, rOFF24SAVE(r1) 91 std rOFF32, rOFF32SAVE(r1) 92 cfi_offset(rWORD8, rWORD8SAVE) 93 cfi_offset(rWORD7, rWORD7SAVE) 94 cfi_offset(rOFF8, rOFF8SAVE) 95 cfi_offset(rOFF16, rOFF16SAVE) 96 cfi_offset(rOFF24, rOFF24SAVE) 97 cfi_offset(rOFF32, rOFF32SAVE) 98 99 li rOFF8,8 100 li rOFF16,16 101 li rOFF24,24 102 li rOFF32,32 103 /* cr0 was set by the clrldi. above: ne means rSTR1 and rSTR2 differ in their low 3 address bits. */ 104 bne L(unaligned) 105/* At this point we know both strings have the same alignment and the 106 compare length is at least 8 bytes. r12 contains the low order 107 3 bits of rSTR1 and cr5 contains the result of the logical compare 108 of r12 to 0. If r12 == 0 then we are already double word 109 aligned and can perform the DW aligned loop. 110 111 Otherwise we know the two strings have the same alignment (but not 112 yet DW). So we force the string addresses to the next lower DW 113 boundary and special case this first DW using shift left to 114 eliminate bits preceding the first byte.
Since we want to join the 115 normal (DW aligned) compare loop, starting at the second double word, 116 we need to adjust the length (rN) and special case the loop 117 versioning for the first DW. This ensures that the loop count is 118 correct and the first DW (shifted) is in the expected register pair. */ 119 .align 4 120L(samealignment): 121 clrrdi rSTR1, rSTR1, 3 122 clrrdi rSTR2, rSTR2, 3 123 beq cr5, L(DWaligned) 124 add rN, rN, r12 125 sldi rWORD6, r12, 3 /* Leading byte offset converted to a bit count; used as the sld shift amount below. */ 126 srdi r0, rN, 5 /* Divide by 32 */ 127 andi. r12, rN, 24 /* Get the DW remainder */ 128 LD rWORD1, 0, rSTR1 129 LD rWORD2, 0, rSTR2 130 cmpldi cr1, r12, 16 131 cmpldi cr7, rN, 32 132 clrldi rN, rN, 61 133 beq L(dPs4) 134 mtctr r0 135 bgt cr1, L(dPs3) 136 beq cr1, L(dPs2) 137 138/* Remainder is 8 */ 139 .align 3 140L(dsP1): 141 sld rWORD5, rWORD1, rWORD6 142 sld rWORD6, rWORD2, rWORD6 143 cmpld cr5, rWORD5, rWORD6 144 blt cr7, L(dP1x) 145/* Do something useful in this cycle since we have to branch anyway. */ 146 LD rWORD1, rOFF8, rSTR1 147 LD rWORD2, rOFF8, rSTR2 148 cmpld cr7, rWORD1, rWORD2 149 b L(dP1e) 150/* Remainder is 16 */ 151 .align 4 152L(dPs2): 153 sld rWORD5, rWORD1, rWORD6 154 sld rWORD6, rWORD2, rWORD6 155 cmpld cr6, rWORD5, rWORD6 156 blt cr7, L(dP2x) 157/* Do something useful in this cycle since we have to branch anyway. */ 158 LD rWORD7, rOFF8, rSTR1 159 LD rWORD8, rOFF8, rSTR2 160 cmpld cr5, rWORD7, rWORD8 161 b L(dP2e) 162/* Remainder is 24 */ 163 .align 4 164L(dPs3): 165 sld rWORD3, rWORD1, rWORD6 166 sld rWORD4, rWORD2, rWORD6 167 cmpld cr1, rWORD3, rWORD4 168 b L(dP3e) 169/* Count is a multiple of 32, remainder is 0 */ 170 .align 4 171L(dPs4): 172 mtctr r0 173 sld rWORD1, rWORD1, rWORD6 174 sld rWORD2, rWORD2, rWORD6 175 cmpld cr7, rWORD1, rWORD2 176 b L(dP4e) 177 178/* At this point we know both strings are double word aligned and the 179 compare length is at least 8 bytes. */ 180 .align 4 181L(DWaligned): 182 andi.
r12, rN, 24 /* Get the DW remainder */ 183 srdi r0, rN, 5 /* Divide by 32 */ 184 cmpldi cr1, r12, 16 185 cmpldi cr7, rN, 32 186 clrldi rN, rN, 61 187 beq L(dP4) 188 bgt cr1, L(dP3) 189 beq cr1, L(dP2) 190 191/* Remainder is 8 */ 192 .align 4 193L(dP1): 194 mtctr r0 195/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early 196 (8-15 byte compare), we want to use only volatile registers. This 197 means we can avoid restoring non-volatile registers since we did not 198 change any on the early exit path. The key here is the non-early 199 exit path only cares about the condition code (cr5), not about which 200 register pair was used. */ 201 LD rWORD5, 0, rSTR1 202 LD rWORD6, 0, rSTR2 203 cmpld cr5, rWORD5, rWORD6 204 blt cr7, L(dP1x) 205 LD rWORD1, rOFF8, rSTR1 206 LD rWORD2, rOFF8, rSTR2 207 cmpld cr7, rWORD1, rWORD2 208L(dP1e): 209 LD rWORD3, rOFF16, rSTR1 210 LD rWORD4, rOFF16, rSTR2 211 cmpld cr1, rWORD3, rWORD4 212 LD rWORD5, rOFF24, rSTR1 213 LD rWORD6, rOFF24, rSTR2 214 cmpld cr6, rWORD5, rWORD6 215 bne cr5, L(dLcr5x) 216 bne cr7, L(dLcr7x) 217 218 LD rWORD7, rOFF32, rSTR1 219 LD rWORD8, rOFF32, rSTR2 220 addi rSTR1, rSTR1, 32 221 addi rSTR2, rSTR2, 32 222 bne cr1, L(dLcr1) 223 cmpld cr5, rWORD7, rWORD8 224 bdnz L(dLoop) 225 bne cr6, L(dLcr6) 226 ld rWORD8, rWORD8SAVE(r1) 227 ld rWORD7, rWORD7SAVE(r1) 228 .align 3 229L(dP1x): 230 sldi. r12, rN, 3 231 bne cr5, L(dLcr5x) 232 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8).
*/ 233 bne L(d00) 234 ld rOFF8, rOFF8SAVE(r1) 235 ld rOFF16, rOFF16SAVE(r1) 236 ld rOFF24, rOFF24SAVE(r1) 237 ld rOFF32, rOFF32SAVE(r1) 238 li rRTN, 0 239 blr 240 241/* Remainder is 16 */ 242 .align 4 243L(dP2): 244 mtctr r0 245 LD rWORD5, 0, rSTR1 246 LD rWORD6, 0, rSTR2 247 cmpld cr6, rWORD5, rWORD6 248 blt cr7, L(dP2x) 249 LD rWORD7, rOFF8, rSTR1 250 LD rWORD8, rOFF8, rSTR2 251 cmpld cr5, rWORD7, rWORD8 252L(dP2e): 253 LD rWORD1, rOFF16, rSTR1 254 LD rWORD2, rOFF16, rSTR2 255 cmpld cr7, rWORD1, rWORD2 256 LD rWORD3, rOFF24, rSTR1 257 LD rWORD4, rOFF24, rSTR2 258 cmpld cr1, rWORD3, rWORD4 259 addi rSTR1, rSTR1, 8 260 addi rSTR2, rSTR2, 8 261 bne cr6, L(dLcr6) 262 bne cr5, L(dLcr5) 263 b L(dLoop2) 264 .align 4 265L(dP2x): 266 LD rWORD3, rOFF8, rSTR1 267 LD rWORD4, rOFF8, rSTR2 268 cmpld cr1, rWORD3, rWORD4 269 sldi. r12, rN, 3 270 bne cr6, L(dLcr6x) 271 addi rSTR1, rSTR1, 8 272 addi rSTR2, rSTR2, 8 273 bne cr1, L(dLcr1x) 274 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ 275 bne L(d00) 276 ld rOFF8, rOFF8SAVE(r1) 277 ld rOFF16, rOFF16SAVE(r1) 278 ld rOFF24, rOFF24SAVE(r1) 279 ld rOFF32, rOFF32SAVE(r1) 280 li rRTN, 0 281 blr 282 283/* Remainder is 24 */ 284 .align 4 285L(dP3): 286 mtctr r0 287 LD rWORD3, 0, rSTR1 288 LD rWORD4, 0, rSTR2 289 cmpld cr1, rWORD3, rWORD4 290L(dP3e): 291 LD rWORD5, rOFF8, rSTR1 292 LD rWORD6, rOFF8, rSTR2 293 cmpld cr6, rWORD5, rWORD6 294 blt cr7, L(dP3x) 295 LD rWORD7, rOFF16, rSTR1 296 LD rWORD8, rOFF16, rSTR2 297 cmpld cr5, rWORD7, rWORD8 298 LD rWORD1, rOFF24, rSTR1 299 LD rWORD2, rOFF24, rSTR2 300 cmpld cr7, rWORD1, rWORD2 301 addi rSTR1, rSTR1, 16 302 addi rSTR2, rSTR2, 16 303 bne cr1, L(dLcr1) 304 bne cr6, L(dLcr6) 305 b L(dLoop1) 306/* Again we are on an early exit path (24-31 byte compare), so we want to 307 only use volatile registers and avoid restoring non-volatile 308 registers. */ 309 .align 4 310L(dP3x): 311 LD rWORD1, rOFF16, rSTR1 312 LD rWORD2, rOFF16, rSTR2 313 cmpld cr7, rWORD1, rWORD2 314 sldi.
r12, rN, 3 315 bne cr1, L(dLcr1x) 316 addi rSTR1, rSTR1, 16 317 addi rSTR2, rSTR2, 16 318 bne cr6, L(dLcr6x) 319 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ 320 bne cr7, L(dLcr7x) 321 bne L(d00) 322 ld rOFF8, rOFF8SAVE(r1) 323 ld rOFF16, rOFF16SAVE(r1) 324 ld rOFF24, rOFF24SAVE(r1) 325 ld rOFF32, rOFF32SAVE(r1) 326 li rRTN, 0 327 blr 328 329/* Count is a multiple of 32, remainder is 0 */ 330 .align 4 331L(dP4): 332 mtctr r0 333 LD rWORD1, 0, rSTR1 334 LD rWORD2, 0, rSTR2 335 cmpld cr7, rWORD1, rWORD2 336L(dP4e): 337 LD rWORD3, rOFF8, rSTR1 338 LD rWORD4, rOFF8, rSTR2 339 cmpld cr1, rWORD3, rWORD4 340 LD rWORD5, rOFF16, rSTR1 341 LD rWORD6, rOFF16, rSTR2 342 cmpld cr6, rWORD5, rWORD6 343 LD rWORD7, rOFF24, rSTR1 344 LD rWORD8, rOFF24, rSTR2 345 addi rSTR1, rSTR1, 24 346 addi rSTR2, rSTR2, 24 347 cmpld cr5, rWORD7, rWORD8 348 bne cr7, L(dLcr7) 349 bne cr1, L(dLcr1) 350 bdz- L(d24) /* Adjust CTR as we start with +4 */ 351/* This is the primary loop */ 352 .align 4 353L(dLoop): 354 LD rWORD1, rOFF8, rSTR1 355 LD rWORD2, rOFF8, rSTR2 356 cmpld cr1, rWORD3, rWORD4 357 bne cr6, L(dLcr6) 358L(dLoop1): 359 LD rWORD3, rOFF16, rSTR1 360 LD rWORD4, rOFF16, rSTR2 361 cmpld cr6, rWORD5, rWORD6 362 bne cr5, L(dLcr5) 363L(dLoop2): 364 LD rWORD5, rOFF24, rSTR1 365 LD rWORD6, rOFF24, rSTR2 366 cmpld cr5, rWORD7, rWORD8 367 bne cr7, L(dLcr7) 368L(dLoop3): 369 LD rWORD7, rOFF32, rSTR1 370 LD rWORD8, rOFF32, rSTR2 371 addi rSTR1, rSTR1, 32 372 addi rSTR2, rSTR2, 32 373 bne cr1, L(dLcr1) 374 cmpld cr7, rWORD1, rWORD2 375 bdnz L(dLoop) 376 377L(dL4): 378 cmpld cr1, rWORD3, rWORD4 379 bne cr6, L(dLcr6) 380 cmpld cr6, rWORD5, rWORD6 381 bne cr5, L(dLcr5) 382 cmpld cr5, rWORD7, rWORD8 383L(d44): 384 bne cr7, L(dLcr7) 385L(d34): 386 bne cr1, L(dLcr1) 387L(d24): 388 bne cr6, L(dLcr6) 389L(d14): 390 sldi. r12, rN, 3 391 bne cr5, L(dLcr5) 392L(d04): 393 ld rWORD8, rWORD8SAVE(r1) 394 ld rWORD7, rWORD7SAVE(r1) 395 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8).
*/ 396 beq L(duzeroLength) 397/* At this point we have a remainder of 1 to 7 bytes to compare. Since 398 we are aligned it is safe to load the whole double word, and use 399 shift right double to eliminate bits beyond the compare length. */ 400L(d00): 401 LD rWORD1, rOFF8, rSTR1 402 LD rWORD2, rOFF8, rSTR2 403 srd rWORD1, rWORD1, rN 404 srd rWORD2, rWORD2, rN 405 cmpld cr7, rWORD1, rWORD2 406 bne cr7, L(dLcr7x) 407 ld rOFF8, rOFF8SAVE(r1) 408 ld rOFF16, rOFF16SAVE(r1) 409 ld rOFF24, rOFF24SAVE(r1) 410 ld rOFF32, rOFF32SAVE(r1) 411 li rRTN, 0 412 blr 413 /* Mismatch exits for the aligned paths. Each L(dLcrN) entry first restores rWORD7/rWORD8; the L(dLcrNx) entry skips that and is used where those two registers are untouched or already restored. Both restore rOFF8..rOFF32 and return 1 if crN is gt, else -1. */ 414 .align 4 415L(dLcr7): 416 ld rWORD8, rWORD8SAVE(r1) 417 ld rWORD7, rWORD7SAVE(r1) 418L(dLcr7x): 419 ld rOFF8, rOFF8SAVE(r1) 420 ld rOFF16, rOFF16SAVE(r1) 421 ld rOFF24, rOFF24SAVE(r1) 422 ld rOFF32, rOFF32SAVE(r1) 423 li rRTN, 1 424 bgtlr cr7 425 li rRTN, -1 426 blr 427 .align 4 428L(dLcr1): 429 ld rWORD8, rWORD8SAVE(r1) 430 ld rWORD7, rWORD7SAVE(r1) 431L(dLcr1x): 432 ld rOFF8, rOFF8SAVE(r1) 433 ld rOFF16, rOFF16SAVE(r1) 434 ld rOFF24, rOFF24SAVE(r1) 435 ld rOFF32, rOFF32SAVE(r1) 436 li rRTN, 1 437 bgtlr cr1 438 li rRTN, -1 439 blr 440 .align 4 441L(dLcr6): 442 ld rWORD8, rWORD8SAVE(r1) 443 ld rWORD7, rWORD7SAVE(r1) 444L(dLcr6x): 445 ld rOFF8, rOFF8SAVE(r1) 446 ld rOFF16, rOFF16SAVE(r1) 447 ld rOFF24, rOFF24SAVE(r1) 448 ld rOFF32, rOFF32SAVE(r1) 449 li rRTN, 1 450 bgtlr cr6 451 li rRTN, -1 452 blr 453 .align 4 454L(dLcr5): 455 ld rWORD8, rWORD8SAVE(r1) 456 ld rWORD7, rWORD7SAVE(r1) 457L(dLcr5x): 458 ld rOFF8, rOFF8SAVE(r1) 459 ld rOFF16, rOFF16SAVE(r1) 460 ld rOFF24, rOFF24SAVE(r1) 461 ld rOFF32, rOFF32SAVE(r1) 462 li rRTN, 1 463 bgtlr cr5 464 li rRTN, -1 465 blr 466 467 .align 4 468L(bytealigned): 469 mtctr rN 470 471/* We need to prime this loop. This loop is swing modulo scheduled 472 to avoid pipe delays. The dependent instruction latencies (load to 473 compare to conditional branch) are 2 to 3 cycles. In this loop each 474 dispatch group ends in a branch and takes 1 cycle.
Effectively 475 the first iteration of the loop only serves to load operands and 476 branches based on compares are delayed until the next loop. 477 478 So we must precondition some registers and condition codes so that 479 we don't exit the loop early on the first iteration. */ 480 481 lbz rWORD1, 0(rSTR1) 482 lbz rWORD2, 0(rSTR2) 483 bdz L(b11) 484 cmpld cr7, rWORD1, rWORD2 485 lbz rWORD3, 1(rSTR1) 486 lbz rWORD4, 1(rSTR2) 487 bdz L(b12) 488 cmpld cr1, rWORD3, rWORD4 489 lbzu rWORD5, 2(rSTR1) 490 lbzu rWORD6, 2(rSTR2) 491 bdz L(b13) 492 .align 4 493L(bLoop): 494 lbzu rWORD1, 1(rSTR1) 495 lbzu rWORD2, 1(rSTR2) 496 bne cr7, L(bLcr7) 497 498 cmpld cr6, rWORD5, rWORD6 499 bdz L(b3i) 500 501 lbzu rWORD3, 1(rSTR1) 502 lbzu rWORD4, 1(rSTR2) 503 bne cr1, L(bLcr1) 504 505 cmpld cr7, rWORD1, rWORD2 506 bdz L(b2i) 507 508 lbzu rWORD5, 1(rSTR1) 509 lbzu rWORD6, 1(rSTR2) 510 bne cr6, L(bLcr6) 511 512 cmpld cr1, rWORD3, rWORD4 513 bdnz L(bLoop) 514 515/* We are speculatively loading bytes before we have tested the previous 516 bytes. But we must avoid overrunning the length (in the ctr) to 517 prevent these speculative loads from causing a segfault. In this 518 case the loop will exit early (before all the pending bytes are 519 tested). In this case we must complete the pending operations 520 before returning.
*/ 521L(b1i): 522 bne cr7, L(bLcr7) 523 bne cr1, L(bLcr1) 524 b L(bx56) 525 .align 4 526L(b2i): 527 bne cr6, L(bLcr6) 528 bne cr7, L(bLcr7) 529 b L(bx34) 530 .align 4 531L(b3i): 532 bne cr1, L(bLcr1) 533 bne cr6, L(bLcr6) 534 b L(bx12) 535 .align 4 536L(bLcr7): 537 li rRTN, 1 538 bgtlr cr7 539 li rRTN, -1 540 blr 541L(bLcr1): 542 li rRTN, 1 543 bgtlr cr1 544 li rRTN, -1 545 blr 546L(bLcr6): 547 li rRTN, 1 548 bgtlr cr6 549 li rRTN, -1 550 blr 551 /* Byte-loop exits that return the difference of one loaded byte pair (s1 byte minus s2 byte) instead of 1 or -1. */ 552L(b13): 553 bne cr7, L(bx12) 554 bne cr1, L(bx34) 555L(bx56): 556 sub rRTN, rWORD5, rWORD6 557 blr 558 nop 559L(b12): 560 bne cr7, L(bx12) 561L(bx34): 562 sub rRTN, rWORD3, rWORD4 563 blr 564L(b11): 565L(bx12): 566 sub rRTN, rWORD1, rWORD2 567 blr 568 569 .align 4 570L(zeroLength): 571 li rRTN, 0 572 blr 573 574 .align 4 575/* At this point we know the strings have different alignment and the 576 compare length is at least 8 bytes. r12 contains the low order 577 3 bits of rSTR1 and cr5 contains the result of the logical compare 578 of r12 to 0. If r12 == 0 then rSTR1 is double word 579 aligned and we can perform the DWunaligned loop. 580 581 Otherwise we know that rSTR1 is not already DW aligned yet. 582 So we can force the string addresses to the next lower DW 583 boundary and special case this first DW using shift left to 584 eliminate bits preceding the first byte. Since we want to join the 585 normal (DWaligned) compare loop, starting at the second double word, 586 we need to adjust the length (rN) and special case the loop 587 versioning for the first DW. This ensures that the loop count is 588 correct and the first DW (shifted) is in the expected register pair.
*/ 589L(unaligned): 590 std rSHL, rSHLSAVE(r1) 591 cfi_offset(rSHL, rSHLSAVE) 592 clrldi rSHL, rSTR2, 61 /* NOTE(review): cr6 still holds the rN == 0 test from entry, which already branched to L(zeroLength), so the next branch looks unreachable. If it were taken, L(duzeroLength) would not restore the rSHL just overwritten - confirm. */ 593 beq cr6, L(duzeroLength) 594 std rSHR, rSHRSAVE(r1) 595 cfi_offset(rSHR, rSHRSAVE) 596 beq cr5, L(DWunaligned) 597 std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) 598 cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) 599/* Adjust the logical start of rSTR2 to compensate for the extra bits 600 in the 1st rSTR1 DW. */ 601 sub rWORD8_SHIFT, rSTR2, r12 602/* But do not attempt to address the DW before the DW that contains 603 the actual start of rSTR2. */ 604 clrrdi rSTR2, rSTR2, 3 605 std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) 606/* Compute the left/right shift counts for the unaligned rSTR2, 607 compensating for the logical (DW aligned) start of rSTR1. */ 608 clrldi rSHL, rWORD8_SHIFT, 61 609 clrrdi rSTR1, rSTR1, 3 610 std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) 611 sldi rSHL, rSHL, 3 612 cmpld cr5, rWORD8_SHIFT, rSTR2 613 add rN, rN, r12 614 sldi rWORD6, r12, 3 615 std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) 616 cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) 617 cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) 618 cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) 619 subfic rSHR, rSHL, 64 620 srdi r0, rN, 5 /* Divide by 32 */ 621 andi. r12, rN, 24 /* Get the DW remainder */ 622/* We normally need to load 2 DWs to start the unaligned rSTR2, but in 623 this special case those bits may be discarded anyway. Also we 624 must avoid loading a DW where none of the bits are part of rSTR2 as 625 this may cross a page boundary and cause a page fault.
*/ 626 li rWORD8, 0 627 blt cr5, L(dus0) 628 LD rWORD8, 0, rSTR2 629 addi rSTR2, rSTR2, 8 630 sld rWORD8, rWORD8, rSHL 631 632L(dus0): 633 LD rWORD1, 0, rSTR1 634 LD rWORD2, 0, rSTR2 635 cmpldi cr1, r12, 16 636 cmpldi cr7, rN, 32 637 srd r12, rWORD2, rSHR 638 clrldi rN, rN, 61 639 beq L(duPs4) 640 mtctr r0 641 or rWORD8, r12, rWORD8 642 bgt cr1, L(duPs3) 643 beq cr1, L(duPs2) 644 645/* Remainder is 8 */ 646 .align 4 647L(dusP1): 648 sld rWORD8_SHIFT, rWORD2, rSHL 649 sld rWORD7, rWORD1, rWORD6 650 sld rWORD8, rWORD8, rWORD6 651 bge cr7, L(duP1e) 652/* At this point we exit early with the first double word compare 653 complete and remainder of 0 to 7 bytes. See L(du14) for details on 654 how we handle the remaining bytes. */ 655 cmpld cr5, rWORD7, rWORD8 656 sldi. rN, rN, 3 657 bne cr5, L(duLcr5) 658 cmpld cr7, rN, rSHR 659 beq L(duZeroReturn) 660 li r0, 0 661 ble cr7, L(dutrim) 662 LD rWORD2, rOFF8, rSTR2 663 srd r0, rWORD2, rSHR 664 b L(dutrim) 665/* Remainder is 16 */ 666 .align 4 667L(duPs2): 668 sld rWORD6_SHIFT, rWORD2, rSHL 669 sld rWORD5, rWORD1, rWORD6 670 sld rWORD6, rWORD8, rWORD6 671 b L(duP2e) 672/* Remainder is 24 */ 673 .align 4 674L(duPs3): 675 sld rWORD4_SHIFT, rWORD2, rSHL 676 sld rWORD3, rWORD1, rWORD6 677 sld rWORD4, rWORD8, rWORD6 678 b L(duP3e) 679/* Count is a multiple of 32, remainder is 0 */ 680 .align 4 681L(duPs4): 682 mtctr r0 683 or rWORD8, r12, rWORD8 684 sld rWORD2_SHIFT, rWORD2, rSHL 685 sld rWORD1, rWORD1, rWORD6 686 sld rWORD2, rWORD8, rWORD6 687 b L(duP4e) 688 689/* At this point we know rSTR1 is double word aligned and the 690 compare length is at least 8 bytes. */ 691 .align 4 692L(DWunaligned): 693 std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) 694 clrrdi rSTR2, rSTR2, 3 695 std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) 696 srdi r0, rN, 5 /* Divide by 32 */ 697 std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) 698 andi.
r12, rN, 24 /* Get the DW remainder */ 699 std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) 700 cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) 701 cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) 702 cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) 703 cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) 704 sldi rSHL, rSHL, 3 705 LD rWORD6, 0, rSTR2 706 LD rWORD8, rOFF8, rSTR2 707 addi rSTR2, rSTR2, 8 708 cmpldi cr1, r12, 16 709 cmpldi cr7, rN, 32 710 clrldi rN, rN, 61 711 subfic rSHR, rSHL, 64 712 sld rWORD6_SHIFT, rWORD6, rSHL 713 beq L(duP4) 714 mtctr r0 715 bgt cr1, L(duP3) 716 beq cr1, L(duP2) 717 718/* Remainder is 8 */ 719 .align 4 720L(duP1): 721 srd r12, rWORD8, rSHR 722 LD rWORD7, 0, rSTR1 723 sld rWORD8_SHIFT, rWORD8, rSHL 724 or rWORD8, r12, rWORD6_SHIFT 725 blt cr7, L(duP1x) 726L(duP1e): 727 LD rWORD1, rOFF8, rSTR1 728 LD rWORD2, rOFF8, rSTR2 729 cmpld cr5, rWORD7, rWORD8 730 srd r0, rWORD2, rSHR 731 sld rWORD2_SHIFT, rWORD2, rSHL 732 or rWORD2, r0, rWORD8_SHIFT 733 LD rWORD3, rOFF16, rSTR1 734 LD rWORD4, rOFF16, rSTR2 735 cmpld cr7, rWORD1, rWORD2 736 srd r12, rWORD4, rSHR 737 sld rWORD4_SHIFT, rWORD4, rSHL 738 bne cr5, L(duLcr5) 739 or rWORD4, r12, rWORD2_SHIFT 740 LD rWORD5, rOFF24, rSTR1 741 LD rWORD6, rOFF24, rSTR2 742 cmpld cr1, rWORD3, rWORD4 743 srd r0, rWORD6, rSHR 744 sld rWORD6_SHIFT, rWORD6, rSHL 745 bne cr7, L(duLcr7) 746 or rWORD6, r0, rWORD4_SHIFT 747 cmpld cr6, rWORD5, rWORD6 748 b L(duLoop3) 749 .align 4 750/* At this point we exit early with the first double word compare 751 complete and remainder of 0 to 7 bytes. See L(du14) for details on 752 how we handle the remaining bytes. */ 753L(duP1x): 754 cmpld cr5, rWORD7, rWORD8 755 sldi.
rN, rN, 3 756 bne cr5, L(duLcr5) 757 cmpld cr7, rN, rSHR 758 beq L(duZeroReturn) 759 li r0, 0 760 ble cr7, L(dutrim) 761 LD rWORD2, rOFF8, rSTR2 762 srd r0, rWORD2, rSHR 763 b L(dutrim) 764/* Remainder is 16 */ 765 .align 4 766L(duP2): 767 srd r0, rWORD8, rSHR 768 LD rWORD5, 0, rSTR1 769 or rWORD6, r0, rWORD6_SHIFT 770 sld rWORD6_SHIFT, rWORD8, rSHL 771L(duP2e): 772 LD rWORD7, rOFF8, rSTR1 773 LD rWORD8, rOFF8, rSTR2 774 cmpld cr6, rWORD5, rWORD6 775 srd r12, rWORD8, rSHR 776 sld rWORD8_SHIFT, rWORD8, rSHL 777 or rWORD8, r12, rWORD6_SHIFT 778 blt cr7, L(duP2x) 779 LD rWORD1, rOFF16, rSTR1 780 LD rWORD2, rOFF16, rSTR2 781 cmpld cr5, rWORD7, rWORD8 782 bne cr6, L(duLcr6) 783 srd r0, rWORD2, rSHR 784 sld rWORD2_SHIFT, rWORD2, rSHL 785 or rWORD2, r0, rWORD8_SHIFT 786 LD rWORD3, rOFF24, rSTR1 787 LD rWORD4, rOFF24, rSTR2 788 cmpld cr7, rWORD1, rWORD2 789 bne cr5, L(duLcr5) 790 srd r12, rWORD4, rSHR 791 sld rWORD4_SHIFT, rWORD4, rSHL 792 or rWORD4, r12, rWORD2_SHIFT 793 addi rSTR1, rSTR1, 8 794 addi rSTR2, rSTR2, 8 795 cmpld cr1, rWORD3, rWORD4 796 b L(duLoop2) 797 .align 4 798L(duP2x): 799 cmpld cr5, rWORD7, rWORD8 800 addi rSTR1, rSTR1, 8 801 addi rSTR2, rSTR2, 8 802 bne cr6, L(duLcr6) 803 sldi.
rN, rN, 3 804 bne cr5, L(duLcr5) 805 cmpld cr7, rN, rSHR 806 beq L(duZeroReturn) 807 li r0, 0 808 ble cr7, L(dutrim) 809 LD rWORD2, rOFF8, rSTR2 810 srd r0, rWORD2, rSHR 811 b L(dutrim) 812 813/* Remainder is 24 */ 814 .align 4 815L(duP3): 816 srd r12, rWORD8, rSHR 817 LD rWORD3, 0, rSTR1 818 sld rWORD4_SHIFT, rWORD8, rSHL 819 or rWORD4, r12, rWORD6_SHIFT 820L(duP3e): 821 LD rWORD5, rOFF8, rSTR1 822 LD rWORD6, rOFF8, rSTR2 823 cmpld cr1, rWORD3, rWORD4 824 srd r0, rWORD6, rSHR 825 sld rWORD6_SHIFT, rWORD6, rSHL 826 or rWORD6, r0, rWORD4_SHIFT 827 LD rWORD7, rOFF16, rSTR1 828 LD rWORD8, rOFF16, rSTR2 829 cmpld cr6, rWORD5, rWORD6 830 bne cr1, L(duLcr1) 831 srd r12, rWORD8, rSHR 832 sld rWORD8_SHIFT, rWORD8, rSHL 833 or rWORD8, r12, rWORD6_SHIFT 834 blt cr7, L(duP3x) 835 LD rWORD1, rOFF24, rSTR1 836 LD rWORD2, rOFF24, rSTR2 837 cmpld cr5, rWORD7, rWORD8 838 bne cr6, L(duLcr6) 839 srd r0, rWORD2, rSHR 840 sld rWORD2_SHIFT, rWORD2, rSHL 841 or rWORD2, r0, rWORD8_SHIFT 842 addi rSTR1, rSTR1, 16 843 addi rSTR2, rSTR2, 16 844 cmpld cr7, rWORD1, rWORD2 845 b L(duLoop1) 846 .align 4 847L(duP3x): 848 addi rSTR1, rSTR1, 16 849 addi rSTR2, rSTR2, 16 850 cmpld cr5, rWORD7, rWORD8 851 bne cr6, L(duLcr6) 852 sldi.
rN, rN, 3 853 bne cr5, L(duLcr5) 854 cmpld cr7, rN, rSHR 855 beq L(duZeroReturn) 856 li r0, 0 857 ble cr7, L(dutrim) 858 LD rWORD2, rOFF8, rSTR2 859 srd r0, rWORD2, rSHR 860 b L(dutrim) 861 862/* Count is a multiple of 32, remainder is 0 */ 863 .align 4 864L(duP4): 865 mtctr r0 866 srd r0, rWORD8, rSHR 867 LD rWORD1, 0, rSTR1 868 sld rWORD2_SHIFT, rWORD8, rSHL 869 or rWORD2, r0, rWORD6_SHIFT 870L(duP4e): 871 LD rWORD3, rOFF8, rSTR1 872 LD rWORD4, rOFF8, rSTR2 873 cmpld cr7, rWORD1, rWORD2 874 srd r12, rWORD4, rSHR 875 sld rWORD4_SHIFT, rWORD4, rSHL 876 or rWORD4, r12, rWORD2_SHIFT 877 LD rWORD5, rOFF16, rSTR1 878 LD rWORD6, rOFF16, rSTR2 879 cmpld cr1, rWORD3, rWORD4 880 bne cr7, L(duLcr7) 881 srd r0, rWORD6, rSHR 882 sld rWORD6_SHIFT, rWORD6, rSHL 883 or rWORD6, r0, rWORD4_SHIFT 884 LD rWORD7, rOFF24, rSTR1 885 LD rWORD8, rOFF24, rSTR2 886 addi rSTR1, rSTR1, 24 887 addi rSTR2, rSTR2, 24 888 cmpld cr6, rWORD5, rWORD6 889 bne cr1, L(duLcr1) 890 srd r12, rWORD8, rSHR 891 sld rWORD8_SHIFT, rWORD8, rSHL 892 or rWORD8, r12, rWORD6_SHIFT 893 cmpld cr5, rWORD7, rWORD8 894 bdz L(du24) /* Adjust CTR as we start with +4 */ 895/* This is the primary loop */ 896 .align 4 897L(duLoop): 898 LD rWORD1, rOFF8, rSTR1 899 LD rWORD2, rOFF8, rSTR2 900 cmpld cr1, rWORD3, rWORD4 901 bne cr6, L(duLcr6) 902 srd r0, rWORD2, rSHR 903 sld rWORD2_SHIFT, rWORD2, rSHL 904 or rWORD2, r0, rWORD8_SHIFT 905L(duLoop1): 906 LD rWORD3, rOFF16, rSTR1 907 LD rWORD4, rOFF16, rSTR2 908 cmpld cr6, rWORD5, rWORD6 909 bne cr5, L(duLcr5) 910 srd r12, rWORD4, rSHR 911 sld rWORD4_SHIFT, rWORD4, rSHL 912 or rWORD4, r12, rWORD2_SHIFT 913L(duLoop2): 914 LD rWORD5, rOFF24, rSTR1 915 LD rWORD6, rOFF24, rSTR2 916 cmpld cr5, rWORD7, rWORD8 917 bne cr7, L(duLcr7) 918 srd r0, rWORD6, rSHR 919 sld rWORD6_SHIFT, rWORD6, rSHL 920 or rWORD6, r0, rWORD4_SHIFT 921L(duLoop3): 922 LD rWORD7, rOFF32, rSTR1 923 LD rWORD8, rOFF32, rSTR2 924 addi rSTR1, rSTR1, 32 925 addi rSTR2, rSTR2, 32 926 cmpld cr7, rWORD1, rWORD2 927 bne cr1,
L(duLcr1) 928 srd r12, rWORD8, rSHR 929 sld rWORD8_SHIFT, rWORD8, rSHL 930 or rWORD8, r12, rWORD6_SHIFT 931 bdnz L(duLoop) 932 933L(duL4): 934 cmpld cr1, rWORD3, rWORD4 935 bne cr6, L(duLcr6) 936 cmpld cr6, rWORD5, rWORD6 937 bne cr5, L(duLcr5) 938 cmpld cr5, rWORD7, rWORD8 939L(du44): 940 bne cr7, L(duLcr7) 941L(du34): 942 bne cr1, L(duLcr1) 943L(du24): 944 bne cr6, L(duLcr6) 945L(du14): 946 sldi. rN, rN, 3 947 bne cr5, L(duLcr5) 948/* At this point we have a remainder of 1 to 7 bytes to compare. We use 949 shift right double to eliminate bits beyond the compare length. 950 951 However it may not be safe to load rWORD2 which may be beyond the 952 string length. So we compare the bit length of the remainder to 953 the right shift count (rSHR). If the bit count is less than or equal 954 we do not need to load rWORD2 (all significant bits are already in 955 rWORD8_SHIFT). */ 956 cmpld cr7, rN, rSHR 957 beq L(duZeroReturn) 958 li r0, 0 959 ble cr7, L(dutrim) 960 LD rWORD2, rOFF8, rSTR2 961 srd r0, rWORD2, rSHR 962 .align 4 963L(dutrim): 964 LD rWORD1, rOFF8, rSTR1 /* The literal -8 below is the rWORD8SAVE slot. */ 965 ld rWORD8, -8(r1) 966 subfic rN, rN, 64 /* Shift count is 64 - rN; rN already holds the remainder in bits here.
*/ 967 or rWORD2, r0, rWORD8_SHIFT 968 ld rWORD7, rWORD7SAVE(r1) 969 ld rSHL, rSHLSAVE(r1) 970 srd rWORD1, rWORD1, rN 971 srd rWORD2, rWORD2, rN 972 ld rSHR, rSHRSAVE(r1) 973 ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) 974 li rRTN, 0 975 cmpld cr7, rWORD1, rWORD2 976 ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) 977 ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) 978 beq cr7, L(dureturn24) 979 li rRTN, 1 980 ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) 981 ld rOFF8, rOFF8SAVE(r1) 982 ld rOFF16, rOFF16SAVE(r1) 983 ld rOFF24, rOFF24SAVE(r1) 984 ld rOFF32, rOFF32SAVE(r1) 985 bgtlr cr7 986 li rRTN, -1 987 blr 988 .align 4 989L(duLcr7): 990 ld rWORD8, rWORD8SAVE(r1) 991 ld rWORD7, rWORD7SAVE(r1) 992 li rRTN, 1 993 bgt cr7, L(dureturn29) 994 ld rSHL, rSHLSAVE(r1) 995 ld rSHR, rSHRSAVE(r1) 996 li rRTN, -1 997 b L(dureturn27) 998 .align 4 999L(duLcr1): 1000 ld rWORD8, rWORD8SAVE(r1) 1001 ld rWORD7, rWORD7SAVE(r1) 1002 li rRTN, 1 1003 bgt cr1, L(dureturn29) 1004 ld rSHL, rSHLSAVE(r1) 1005 ld rSHR, rSHRSAVE(r1) 1006 li rRTN, -1 1007 b L(dureturn27) 1008 .align 4 1009L(duLcr6): 1010 ld rWORD8, rWORD8SAVE(r1) 1011 ld rWORD7, rWORD7SAVE(r1) 1012 li rRTN, 1 1013 bgt cr6, L(dureturn29) 1014 ld rSHL, rSHLSAVE(r1) 1015 ld rSHR, rSHRSAVE(r1) 1016 li rRTN, -1 1017 b L(dureturn27) 1018 .align 4 1019L(duLcr5): 1020 ld rWORD8, rWORD8SAVE(r1) 1021 ld rWORD7, rWORD7SAVE(r1) 1022 li rRTN, 1 1023 bgt cr5, L(dureturn29) 1024 ld rSHL, rSHLSAVE(r1) 1025 ld rSHR, rSHRSAVE(r1) 1026 li rRTN, -1 1027 b L(dureturn27) 1028 /* Shared epilogue for the unaligned paths. Later entry points skip restores the branching path has already done; the 29/27/24 suffixes match the first register restored at that label (rSHL is r29, rWORD8_SHIFT is r27, rWORD6_SHIFT is r24). */ 1029 .align 3 1030L(duZeroReturn): 1031 li rRTN, 0 1032 .align 4 1033L(dureturn): 1034 ld rWORD8, rWORD8SAVE(r1) 1035 ld rWORD7, rWORD7SAVE(r1) 1036L(dureturn29): 1037 ld rSHL, rSHLSAVE(r1) 1038 ld rSHR, rSHRSAVE(r1) 1039L(dureturn27): 1040 ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) 1041 ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) 1042 ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) 1043L(dureturn24): 1044 ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) 1045 ld rOFF8, rOFF8SAVE(r1) 1046 ld rOFF16, rOFF16SAVE(r1) 1047 ld rOFF24, rOFF24SAVE(r1) 1048
ld rOFF32, rOFF32SAVE(r1) 1049 blr 1050 /* Return 0, restoring only rOFF8..rOFF32. */ 1051L(duzeroLength): 1052 ld rOFF8, rOFF8SAVE(r1) 1053 ld rOFF16, rOFF16SAVE(r1) 1054 ld rOFF24, rOFF24SAVE(r1) 1055 ld rOFF32, rOFF32SAVE(r1) 1056 li rRTN, 0 1057 blr 1058 1059END (MEMCMP) 1060libc_hidden_builtin_def (memcmp) 1061weak_alias (memcmp, bcmp) 1062strong_alias (memcmp, __memcmpeq) 1063libc_hidden_def (__memcmpeq) 1064