1/* memcmp with SSE4.1, wmemcmp with SSE4.1 2 Copyright (C) 2010-2021 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#if IS_IN (libc) 20 21# include <sysdep.h> 22 23# ifndef MEMCMP 24# define MEMCMP __memcmp_sse4_1 25# endif 26 27#ifdef USE_AS_WMEMCMP 28# define CMPEQ pcmpeqd 29# define CHAR_SIZE 4 30#else 31# define CMPEQ pcmpeqb 32# define CHAR_SIZE 1 33#endif 34 35 36/* Warning! 37 wmemcmp has to use SIGNED comparison for elements. 38 memcmp has to use UNSIGNED comparison for elemnts. 39*/ 40 41 .section .text.sse4.1,"ax",@progbits 42ENTRY (MEMCMP) 43# ifdef USE_AS_WMEMCMP 44 shl $2, %RDX_LP 45# elif defined __ILP32__ 46 /* Clear the upper 32 bits. */ 47 mov %edx, %edx 48# endif 49 cmp $79, %RDX_LP 50 ja L(79bytesormore) 51 52 cmp $CHAR_SIZE, %RDX_LP 53 jbe L(firstbyte) 54 55 /* N in (CHAR_SIZE, 79) bytes. */ 56 cmpl $32, %edx 57 ja L(more_32_bytes) 58 59 cmpl $16, %edx 60 jae L(16_to_32_bytes) 61 62# ifndef USE_AS_WMEMCMP 63 cmpl $8, %edx 64 jae L(8_to_16_bytes) 65 66 cmpl $4, %edx 67 jb L(2_to_3_bytes) 68 69 movl (%rdi), %eax 70 movl (%rsi), %ecx 71 72 bswap %eax 73 bswap %ecx 74 75 shlq $32, %rax 76 shlq $32, %rcx 77 78 movl -4(%rdi, %rdx), %edi 79 movl -4(%rsi, %rdx), %esi 80 81 bswap %edi 82 bswap %esi 83 84 orq %rdi, %rax 85 orq %rsi, %rcx 86 subq %rcx, %rax 87 cmovne %edx, %eax 88 sbbl %ecx, %ecx 89 orl %ecx, %eax 90 ret 91 92 .p2align 4,, 8 93L(2_to_3_bytes): 94 movzwl (%rdi), %eax 95 movzwl (%rsi), %ecx 96 shll $8, %eax 97 shll $8, %ecx 98 bswap %eax 99 bswap %ecx 100 movzbl -1(%rdi, %rdx), %edi 101 movzbl -1(%rsi, %rdx), %esi 102 orl %edi, %eax 103 orl %esi, %ecx 104 subl %ecx, %eax 105 ret 106 107 .p2align 4,, 8 108L(8_to_16_bytes): 109 movq (%rdi), %rax 110 movq (%rsi), %rcx 111 112 bswap %rax 113 bswap %rcx 114 115 subq %rcx, %rax 116 jne L(8_to_16_bytes_done) 117 118 movq -8(%rdi, %rdx), %rax 119 movq -8(%rsi, %rdx), %rcx 120 121 bswap %rax 122 bswap %rcx 123 124 subq %rcx, %rax 125 126L(8_to_16_bytes_done): 127 cmovne %edx, %eax 128 sbbl %ecx, %ecx 129 orl %ecx, %eax 130 ret 131# else 132 xorl %eax, %eax 133 movl (%rdi), %ecx 134 cmpl (%rsi), %ecx 135 jne L(8_to_16_bytes_done) 136 movl 4(%rdi), %ecx 137 cmpl 4(%rsi), %ecx 138 jne L(8_to_16_bytes_done) 139 movl -4(%rdi, %rdx), %ecx 140 cmpl -4(%rsi, %rdx), %ecx 141 jne L(8_to_16_bytes_done) 142 ret 143# endif 144 145 .p2align 4,, 3 146L(ret_zero): 147 xorl %eax, %eax 148L(zero): 149 ret 150 151 .p2align 4,, 8 152L(firstbyte): 153 jb L(ret_zero) 154# ifdef USE_AS_WMEMCMP 155 xorl %eax, %eax 156 movl (%rdi), %ecx 157 cmpl (%rsi), %ecx 158 je L(zero) 159L(8_to_16_bytes_done): 160 setg %al 161 leal -1(%rax, %rax), %eax 162# else 163 movzbl (%rdi), %eax 164 movzbl (%rsi), %ecx 165 sub %ecx, %eax 166# endif 167 ret 168 169 .p2align 4 170L(vec_return_begin_48): 171 addq $16, %rdi 172 addq $16, %rsi 173L(vec_return_begin_32): 174 bsfl %eax, %eax 175# ifdef USE_AS_WMEMCMP 176 movl 32(%rdi, %rax), %ecx 177 xorl %edx, %edx 178 cmpl 32(%rsi, %rax), %ecx 179 setg %dl 180 leal -1(%rdx, %rdx), %eax 181# else 182 movzbl 32(%rsi, %rax), %ecx 183 movzbl 32(%rdi, %rax), %eax 184 subl %ecx, %eax 185# endif 186 ret 187 188 .p2align 4 189L(vec_return_begin_16): 190 addq $16, %rdi 191 addq $16, %rsi 192L(vec_return_begin): 193 bsfl %eax, %eax 194# ifdef USE_AS_WMEMCMP 195 movl (%rdi, %rax), %ecx 196 xorl %edx, %edx 197 cmpl (%rsi, %rax), %ecx 198 setg %dl 199 leal -1(%rdx, %rdx), %eax 200# else 201 movzbl (%rsi, %rax), %ecx 202 movzbl (%rdi, %rax), %eax 203 subl %ecx, %eax 204# endif 205 ret 206 207 .p2align 4 208L(vec_return_end_16): 209 subl $16, %edx 210L(vec_return_end): 211 bsfl %eax, %eax 212 addl %edx, %eax 213# ifdef USE_AS_WMEMCMP 214 movl -16(%rdi, %rax), %ecx 215 xorl %edx, %edx 216 cmpl -16(%rsi, %rax), %ecx 217 setg %dl 218 leal -1(%rdx, %rdx), %eax 219# else 220 movzbl -16(%rsi, %rax), %ecx 221 movzbl -16(%rdi, %rax), %eax 222 subl %ecx, %eax 223# endif 224 ret 225 226 .p2align 4,, 8 227L(more_32_bytes): 228 movdqu (%rdi), %xmm0 229 movdqu (%rsi), %xmm1 230 CMPEQ %xmm0, %xmm1 231 pmovmskb %xmm1, %eax 232 incw %ax 233 jnz L(vec_return_begin) 234 235 movdqu 16(%rdi), %xmm0 236 movdqu 16(%rsi), %xmm1 237 CMPEQ %xmm0, %xmm1 238 pmovmskb %xmm1, %eax 239 incw %ax 240 jnz L(vec_return_begin_16) 241 242 cmpl $64, %edx 243 jbe L(32_to_64_bytes) 244 movdqu 32(%rdi), %xmm0 245 movdqu 32(%rsi), %xmm1 246 CMPEQ %xmm0, %xmm1 247 pmovmskb %xmm1, %eax 248 incw %ax 249 jnz L(vec_return_begin_32) 250 251 .p2align 4,, 6 252L(32_to_64_bytes): 253 movdqu -32(%rdi, %rdx), %xmm0 254 movdqu -32(%rsi, %rdx), %xmm1 255 CMPEQ %xmm0, %xmm1 256 pmovmskb %xmm1, %eax 257 incw %ax 258 jnz L(vec_return_end_16) 259 260 movdqu -16(%rdi, %rdx), %xmm0 261 movdqu -16(%rsi, %rdx), %xmm1 262 CMPEQ %xmm0, %xmm1 263 pmovmskb %xmm1, %eax 264 incw %ax 265 jnz L(vec_return_end) 266 ret 267 268 .p2align 4 269L(16_to_32_bytes): 270 movdqu (%rdi), %xmm0 271 movdqu (%rsi), %xmm1 272 CMPEQ %xmm0, %xmm1 273 pmovmskb %xmm1, %eax 274 incw %ax 275 jnz L(vec_return_begin) 276 277 movdqu -16(%rdi, %rdx), %xmm0 278 movdqu -16(%rsi, %rdx), %xmm1 279 CMPEQ %xmm0, %xmm1 280 pmovmskb %xmm1, %eax 281 incw %ax 282 jnz L(vec_return_end) 283 ret 284 285 286 .p2align 4 287L(79bytesormore): 288 movdqu (%rdi), %xmm0 289 movdqu (%rsi), %xmm1 290 CMPEQ %xmm0, %xmm1 291 pmovmskb %xmm1, %eax 292 incw %ax 293 jnz L(vec_return_begin) 294 295 296 mov %rsi, %rcx 297 and $-16, %rsi 298 add $16, %rsi 299 sub %rsi, %rcx 300 301 sub %rcx, %rdi 302 add %rcx, %rdx 303 test $0xf, %rdi 304 jz L(2aligned) 305 306 cmp $128, %rdx 307 ja L(128bytesormore) 308 309 .p2align 4,, 6 310L(less128bytes): 311 movdqu (%rdi), %xmm1 312 CMPEQ (%rsi), %xmm1 313 pmovmskb %xmm1, %eax 314 incw %ax 315 jnz L(vec_return_begin) 316 317 movdqu 16(%rdi), %xmm1 318 CMPEQ 16(%rsi), %xmm1 319 pmovmskb %xmm1, %eax 320 incw %ax 321 jnz L(vec_return_begin_16) 322 323 movdqu 32(%rdi), %xmm1 324 CMPEQ 32(%rsi), %xmm1 325 pmovmskb %xmm1, %eax 326 incw %ax 327 jnz L(vec_return_begin_32) 328 329 movdqu 48(%rdi), %xmm1 330 CMPEQ 48(%rsi), %xmm1 331 pmovmskb %xmm1, %eax 332 incw %ax 333 jnz L(vec_return_begin_48) 334 335 cmp $96, %rdx 336 jb L(32_to_64_bytes) 337 338 addq $64, %rdi 339 addq $64, %rsi 340 subq $64, %rdx 341 342 .p2align 4,, 6 343L(last_64_bytes): 344 movdqu (%rdi), %xmm1 345 CMPEQ (%rsi), %xmm1 346 pmovmskb %xmm1, %eax 347 incw %ax 348 jnz L(vec_return_begin) 349 350 movdqu 16(%rdi), %xmm1 351 CMPEQ 16(%rsi), %xmm1 352 pmovmskb %xmm1, %eax 353 incw %ax 354 jnz L(vec_return_begin_16) 355 356 movdqu -32(%rdi, %rdx), %xmm0 357 movdqu -32(%rsi, %rdx), %xmm1 358 CMPEQ %xmm0, %xmm1 359 pmovmskb %xmm1, %eax 360 incw %ax 361 jnz L(vec_return_end_16) 362 363 movdqu -16(%rdi, %rdx), %xmm0 364 movdqu -16(%rsi, %rdx), %xmm1 365 CMPEQ %xmm0, %xmm1 366 pmovmskb %xmm1, %eax 367 incw %ax 368 jnz L(vec_return_end) 369 ret 370 371 .p2align 4 372L(128bytesormore): 373 cmp $256, %rdx 374 ja L(unaligned_loop) 375L(less256bytes): 376 movdqu (%rdi), %xmm1 377 CMPEQ (%rsi), %xmm1 378 pmovmskb %xmm1, %eax 379 incw %ax 380 jnz L(vec_return_begin) 381 382 movdqu 16(%rdi), %xmm1 383 CMPEQ 16(%rsi), %xmm1 384 pmovmskb %xmm1, %eax 385 incw %ax 386 jnz L(vec_return_begin_16) 387 388 movdqu 32(%rdi), %xmm1 389 CMPEQ 32(%rsi), %xmm1 390 pmovmskb %xmm1, %eax 391 incw %ax 392 jnz L(vec_return_begin_32) 393 394 movdqu 48(%rdi), %xmm1 395 CMPEQ 48(%rsi), %xmm1 396 pmovmskb %xmm1, %eax 397 incw %ax 398 jnz L(vec_return_begin_48) 399 400 addq $64, %rdi 401 addq $64, %rsi 402 403 movdqu (%rdi), %xmm1 404 CMPEQ (%rsi), %xmm1 405 pmovmskb %xmm1, %eax 406 incw %ax 407 jnz L(vec_return_begin) 408 409 movdqu 16(%rdi), %xmm1 410 CMPEQ 16(%rsi), %xmm1 411 pmovmskb %xmm1, %eax 412 incw %ax 413 jnz L(vec_return_begin_16) 414 415 movdqu 32(%rdi), %xmm1 416 CMPEQ 32(%rsi), %xmm1 417 pmovmskb %xmm1, %eax 418 incw %ax 419 jnz L(vec_return_begin_32) 420 421 movdqu 48(%rdi), %xmm1 422 CMPEQ 48(%rsi), %xmm1 423 pmovmskb %xmm1, %eax 424 incw %ax 425 jnz L(vec_return_begin_48) 426 427 addq $-128, %rdx 428 subq $-64, %rsi 429 subq $-64, %rdi 430 431 cmp $64, %rdx 432 ja L(less128bytes) 433 434 cmp $32, %rdx 435 ja L(last_64_bytes) 436 437 movdqu -32(%rdi, %rdx), %xmm0 438 movdqu -32(%rsi, %rdx), %xmm1 439 CMPEQ %xmm0, %xmm1 440 pmovmskb %xmm1, %eax 441 incw %ax 442 jnz L(vec_return_end_16) 443 444 movdqu -16(%rdi, %rdx), %xmm0 445 movdqu -16(%rsi, %rdx), %xmm1 446 CMPEQ %xmm0, %xmm1 447 pmovmskb %xmm1, %eax 448 incw %ax 449 jnz L(vec_return_end) 450 ret 451 452 .p2align 4 453L(unaligned_loop): 454# ifdef DATA_CACHE_SIZE_HALF 455 mov $DATA_CACHE_SIZE_HALF, %R8_LP 456# else 457 mov __x86_data_cache_size_half(%rip), %R8_LP 458# endif 459 movq %r8, %r9 460 addq %r8, %r8 461 addq %r9, %r8 462 cmpq %r8, %rdx 463 ja L(L2_L3_cache_unaligned) 464 sub $64, %rdx 465 .p2align 4 466L(64bytesormore_loop): 467 movdqu (%rdi), %xmm0 468 movdqu 16(%rdi), %xmm1 469 movdqu 32(%rdi), %xmm2 470 movdqu 48(%rdi), %xmm3 471 472 CMPEQ (%rsi), %xmm0 473 CMPEQ 16(%rsi), %xmm1 474 CMPEQ 32(%rsi), %xmm2 475 CMPEQ 48(%rsi), %xmm3 476 477 pand %xmm0, %xmm1 478 pand %xmm2, %xmm3 479 pand %xmm1, %xmm3 480 481 pmovmskb %xmm3, %eax 482 incw %ax 483 jnz L(64bytesormore_loop_end) 484 485 add $64, %rsi 486 add $64, %rdi 487 sub $64, %rdx 488 ja L(64bytesormore_loop) 489 490 .p2align 4,, 6 491L(loop_tail): 492 addq %rdx, %rdi 493 movdqu (%rdi), %xmm0 494 movdqu 16(%rdi), %xmm1 495 movdqu 32(%rdi), %xmm2 496 movdqu 48(%rdi), %xmm3 497 498 addq %rdx, %rsi 499 movdqu (%rsi), %xmm4 500 movdqu 16(%rsi), %xmm5 501 movdqu 32(%rsi), %xmm6 502 movdqu 48(%rsi), %xmm7 503 504 CMPEQ %xmm4, %xmm0 505 CMPEQ %xmm5, %xmm1 506 CMPEQ %xmm6, %xmm2 507 CMPEQ %xmm7, %xmm3 508 509 pand %xmm0, %xmm1 510 pand %xmm2, %xmm3 511 pand %xmm1, %xmm3 512 513 pmovmskb %xmm3, %eax 514 incw %ax 515 jnz L(64bytesormore_loop_end) 516 ret 517 518L(L2_L3_cache_unaligned): 519 subq $64, %rdx 520 .p2align 4 521L(L2_L3_unaligned_128bytes_loop): 522 prefetchnta 0x1c0(%rdi) 523 prefetchnta 0x1c0(%rsi) 524 525 movdqu (%rdi), %xmm0 526 movdqu 16(%rdi), %xmm1 527 movdqu 32(%rdi), %xmm2 528 movdqu 48(%rdi), %xmm3 529 530 CMPEQ (%rsi), %xmm0 531 CMPEQ 16(%rsi), %xmm1 532 CMPEQ 32(%rsi), %xmm2 533 CMPEQ 48(%rsi), %xmm3 534 535 pand %xmm0, %xmm1 536 pand %xmm2, %xmm3 537 pand %xmm1, %xmm3 538 539 pmovmskb %xmm3, %eax 540 incw %ax 541 jnz L(64bytesormore_loop_end) 542 543 add $64, %rsi 544 add $64, %rdi 545 sub $64, %rdx 546 ja L(L2_L3_unaligned_128bytes_loop) 547 jmp L(loop_tail) 548 549 550 /* This case is for machines which are sensitive for unaligned 551 * instructions. */ 552 .p2align 4 553L(2aligned): 554 cmp $128, %rdx 555 ja L(128bytesormorein2aligned) 556L(less128bytesin2aligned): 557 movdqa (%rdi), %xmm1 558 CMPEQ (%rsi), %xmm1 559 pmovmskb %xmm1, %eax 560 incw %ax 561 jnz L(vec_return_begin) 562 563 movdqa 16(%rdi), %xmm1 564 CMPEQ 16(%rsi), %xmm1 565 pmovmskb %xmm1, %eax 566 incw %ax 567 jnz L(vec_return_begin_16) 568 569 movdqa 32(%rdi), %xmm1 570 CMPEQ 32(%rsi), %xmm1 571 pmovmskb %xmm1, %eax 572 incw %ax 573 jnz L(vec_return_begin_32) 574 575 movdqa 48(%rdi), %xmm1 576 CMPEQ 48(%rsi), %xmm1 577 pmovmskb %xmm1, %eax 578 incw %ax 579 jnz L(vec_return_begin_48) 580 581 cmp $96, %rdx 582 jb L(32_to_64_bytes) 583 584 addq $64, %rdi 585 addq $64, %rsi 586 subq $64, %rdx 587 588 .p2align 4,, 6 589L(aligned_last_64_bytes): 590 movdqa (%rdi), %xmm1 591 CMPEQ (%rsi), %xmm1 592 pmovmskb %xmm1, %eax 593 incw %ax 594 jnz L(vec_return_begin) 595 596 movdqa 16(%rdi), %xmm1 597 CMPEQ 16(%rsi), %xmm1 598 pmovmskb %xmm1, %eax 599 incw %ax 600 jnz L(vec_return_begin_16) 601 602 movdqu -32(%rdi, %rdx), %xmm0 603 movdqu -32(%rsi, %rdx), %xmm1 604 CMPEQ %xmm0, %xmm1 605 pmovmskb %xmm1, %eax 606 incw %ax 607 jnz L(vec_return_end_16) 608 609 movdqu -16(%rdi, %rdx), %xmm0 610 movdqu -16(%rsi, %rdx), %xmm1 611 CMPEQ %xmm0, %xmm1 612 pmovmskb %xmm1, %eax 613 incw %ax 614 jnz L(vec_return_end) 615 ret 616 617 .p2align 4 618L(128bytesormorein2aligned): 619 cmp $256, %rdx 620 ja L(aligned_loop) 621L(less256bytesin2alinged): 622 movdqa (%rdi), %xmm1 623 CMPEQ (%rsi), %xmm1 624 pmovmskb %xmm1, %eax 625 incw %ax 626 jnz L(vec_return_begin) 627 628 movdqa 16(%rdi), %xmm1 629 CMPEQ 16(%rsi), %xmm1 630 pmovmskb %xmm1, %eax 631 incw %ax 632 jnz L(vec_return_begin_16) 633 634 movdqa 32(%rdi), %xmm1 635 CMPEQ 32(%rsi), %xmm1 636 pmovmskb %xmm1, %eax 637 incw %ax 638 jnz L(vec_return_begin_32) 639 640 movdqa 48(%rdi), %xmm1 641 CMPEQ 48(%rsi), %xmm1 642 pmovmskb %xmm1, %eax 643 incw %ax 644 jnz L(vec_return_begin_48) 645 646 addq $64, %rdi 647 addq $64, %rsi 648 649 movdqa (%rdi), %xmm1 650 CMPEQ (%rsi), %xmm1 651 pmovmskb %xmm1, %eax 652 incw %ax 653 jnz L(vec_return_begin) 654 655 movdqa 16(%rdi), %xmm1 656 CMPEQ 16(%rsi), %xmm1 657 pmovmskb %xmm1, %eax 658 incw %ax 659 jnz L(vec_return_begin_16) 660 661 movdqa 32(%rdi), %xmm1 662 CMPEQ 32(%rsi), %xmm1 663 pmovmskb %xmm1, %eax 664 incw %ax 665 jnz L(vec_return_begin_32) 666 667 movdqa 48(%rdi), %xmm1 668 CMPEQ 48(%rsi), %xmm1 669 pmovmskb %xmm1, %eax 670 incw %ax 671 jnz L(vec_return_begin_48) 672 673 addq $-128, %rdx 674 subq $-64, %rsi 675 subq $-64, %rdi 676 677 cmp $64, %rdx 678 ja L(less128bytesin2aligned) 679 680 cmp $32, %rdx 681 ja L(aligned_last_64_bytes) 682 683 movdqu -32(%rdi, %rdx), %xmm0 684 movdqu -32(%rsi, %rdx), %xmm1 685 CMPEQ %xmm0, %xmm1 686 pmovmskb %xmm1, %eax 687 incw %ax 688 jnz L(vec_return_end_16) 689 690 movdqu -16(%rdi, %rdx), %xmm0 691 movdqu -16(%rsi, %rdx), %xmm1 692 CMPEQ %xmm0, %xmm1 693 pmovmskb %xmm1, %eax 694 incw %ax 695 jnz L(vec_return_end) 696 ret 697 698 .p2align 4 699L(aligned_loop): 700# ifdef DATA_CACHE_SIZE_HALF 701 mov $DATA_CACHE_SIZE_HALF, %R8_LP 702# else 703 mov __x86_data_cache_size_half(%rip), %R8_LP 704# endif 705 movq %r8, %r9 706 addq %r8, %r8 707 addq %r9, %r8 708 cmpq %r8, %rdx 709 ja L(L2_L3_cache_aligned) 710 711 sub $64, %rdx 712 .p2align 4 713L(64bytesormore_loopin2aligned): 714 movdqa (%rdi), %xmm0 715 movdqa 16(%rdi), %xmm1 716 movdqa 32(%rdi), %xmm2 717 movdqa 48(%rdi), %xmm3 718 719 CMPEQ (%rsi), %xmm0 720 CMPEQ 16(%rsi), %xmm1 721 CMPEQ 32(%rsi), %xmm2 722 CMPEQ 48(%rsi), %xmm3 723 724 pand %xmm0, %xmm1 725 pand %xmm2, %xmm3 726 pand %xmm1, %xmm3 727 728 pmovmskb %xmm3, %eax 729 incw %ax 730 jnz L(64bytesormore_loop_end) 731 add $64, %rsi 732 add $64, %rdi 733 sub $64, %rdx 734 ja L(64bytesormore_loopin2aligned) 735 jmp L(loop_tail) 736 737L(L2_L3_cache_aligned): 738 subq $64, %rdx 739 .p2align 4 740L(L2_L3_aligned_128bytes_loop): 741 prefetchnta 0x1c0(%rdi) 742 prefetchnta 0x1c0(%rsi) 743 movdqa (%rdi), %xmm0 744 movdqa 16(%rdi), %xmm1 745 movdqa 32(%rdi), %xmm2 746 movdqa 48(%rdi), %xmm3 747 748 CMPEQ (%rsi), %xmm0 749 CMPEQ 16(%rsi), %xmm1 750 CMPEQ 32(%rsi), %xmm2 751 CMPEQ 48(%rsi), %xmm3 752 753 pand %xmm0, %xmm1 754 pand %xmm2, %xmm3 755 pand %xmm1, %xmm3 756 757 pmovmskb %xmm3, %eax 758 incw %ax 759 jnz L(64bytesormore_loop_end) 760 761 addq $64, %rsi 762 addq $64, %rdi 763 subq $64, %rdx 764 ja L(L2_L3_aligned_128bytes_loop) 765 jmp L(loop_tail) 766 767 .p2align 4 768L(64bytesormore_loop_end): 769 pmovmskb %xmm0, %ecx 770 incw %cx 771 jnz L(loop_end_ret) 772 773 pmovmskb %xmm1, %ecx 774 notw %cx 775 sall $16, %ecx 776 jnz L(loop_end_ret) 777 778 pmovmskb %xmm2, %ecx 779 notw %cx 780 shlq $32, %rcx 781 jnz L(loop_end_ret) 782 783 addq $48, %rdi 784 addq $48, %rsi 785 movq %rax, %rcx 786 787 .p2align 4,, 6 788L(loop_end_ret): 789 bsfq %rcx, %rcx 790# ifdef USE_AS_WMEMCMP 791 movl (%rdi, %rcx), %eax 792 xorl %edx, %edx 793 cmpl (%rsi, %rcx), %eax 794 setg %dl 795 leal -1(%rdx, %rdx), %eax 796# else 797 movzbl (%rdi, %rcx), %eax 798 movzbl (%rsi, %rcx), %ecx 799 subl %ecx, %eax 800# endif 801 ret 802END (MEMCMP) 803#endif 804