/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
#  define STRCMP	__strcmp_evex
# endif

# define PAGE_SIZE	4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT	7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define VPTESTM	vptestmd
#  define SHIFT_REG32	r8d
#  define SHIFT_REG64	r8
/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
/* Compare packed bytes.  */
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define VPTESTM	vptestmb
#  define SHIFT_REG32	ecx
#  define SHIFT_REG64	rcx
/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# define XMMZERO	xmm16
# define XMM0		xmm17
# define XMM1		xmm18

# define YMMZERO	ymm16
# define YMM0		ymm17
# define YMM1		ymm18
# define YMM2		ymm19
# define YMM3		ymm20
# define YMM4		ymm21
# define YMM5		ymm22
# define YMM6		ymm23
# define YMM7		ymm24
# define YMM8		ymm25
# define YMM9		ymm26
# define YMM10		ymm27

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   latter can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP.  In order to check the null CHAR, algorithm keeps the
   matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
   KORD).  In general, the costs of comparing VEC_SIZE bytes (32-bytes)
   are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
   instructions.  Main loop (away from page boundary) compares 4
   vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
   bytes) on each loop.

   The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
   is the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero is
   returned.  */

	.section .text.evex,"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset.  */
	cmp	$1, %RDX_LP
	je	L(char0)
	jb	L(zero)
#  ifdef USE_AS_WCSCMP
	/* Convert units: from wide to byte char.  */
	shl	$2, %RDX_LP
#  endif
	/* Register %r11 tracks the maximum offset.  */
	mov	%RDX_LP, %R11_LP
# endif
	movl	%edi, %eax
	xorl	%edx, %edx
	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg	L(cross_page)
	/* Start comparing 4 vectors.  */
	VMOVU	(%rdi), %YMM0

	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2

	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi).  */
	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}

	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(next_3_vectors)
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11).   */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	je	L(return)
L(wcscmp_return):
	setl	%al
	negl	%eax
	orl	$1, %eax
L(return):
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	ret

L(return_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11).  */
	addq	$VEC_SIZE, %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rdx), %ecx
	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rdi, %rdx), %eax
	movzbl	VEC_SIZE(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

L(return_2_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 2), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

L(return_3_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 3), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(next_3_vectors):
	VMOVU	VEC_SIZE(%rdi), %YMM0
	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(return_vec_size)

	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(return_2_vec_size)

	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (VEC_SIZE * 3)(%rsi).  */
	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(return_3_vec_size)
L(main_loop_header):
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	movl	$PAGE_SIZE, %ecx
	/* Align load via RAX.  */
	andq	$-(VEC_SIZE * 4), %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when base pointers are
	   moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch, offset is before the mismatched index.
	 */
	subq	%rdx, %r11
	jbe	L(zero)
# endif
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq	%rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
	movl	%ecx, %esi
	jmp	L(loop_start)

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq	$(VEC_SIZE * 4), %r11
	jbe	L(zero)
# endif
	addq	$(VEC_SIZE * 4), %rax
	addq	$(VEC_SIZE * 4), %rdx
L(loop_start):
	testl	%esi, %esi
	leal	-1(%esi), %esi
	je	L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  */
	VMOVA	(%rax), %YMM0
	VMOVA	VEC_SIZE(%rax), %YMM2
	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6

	VPMINU	%YMM0, %YMM2, %YMM8
	VPMINU	%YMM4, %YMM6, %YMM9

	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
	VPMINU	%YMM8, %YMM9, %YMM8

	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
	VPTESTM	%YMM8, %YMM8, %k1

	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
	vpxorq	(%rdx), %YMM0, %YMM1
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7

	vporq	%YMM1, %YMM3, %YMM9
	vporq	%YMM5, %YMM7, %YMM10

	/* A non-zero CHAR in YMM9 represents a mismatch.  */
	vporq	%YMM9, %YMM10, %YMM9

	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(loop)

	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
	VPTESTM	%YMM0, %YMM0, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM0 and (%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(test_vec)
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq	$VEC_SIZE, %r11
	jbe	L(zero)
# endif
	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
	VPTESTM	%YMM2, %YMM2, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM2 and VEC_SIZE(%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(test_2_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edi
# endif
# ifdef USE_AS_STRNCMP
	addq	$VEC_SIZE, %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	VEC_SIZE(%rsi, %rdi), %ecx
	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rax, %rdi), %eax
	movzbl	VEC_SIZE(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 2), %r11
	jbe	L(zero)
# endif
	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
	VPTESTM	%YMM4, %YMM4, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	je	L(test_3_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edi
# endif
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 3), %r11
	jbe	L(zero)
# endif
	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
	VPTESTM	%YMM6, %YMM6, %k1
	/* Each bit cleared in K0 represents a mismatch or a null CHAR
	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
	kmovd	%k0, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 3), %rcx
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %esi
	cmpl	(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(loop_cross_page):
	xorl	%r10d, %r10d
	movq	%rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl	$((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq	%rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64.  */
# if (VEC_SIZE * 2) != 64
#  error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl	$(VEC_SIZE * 2), %ecx
	jge	L(loop_cross_page_2_vec)

	VMOVU	(%rax, %r10), %YMM2
	VMOVU	VEC_SIZE(%rax, %r10), %YMM3

	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
	VPTESTM	%YMM2, %YMM2, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM2 and 32 bytes at (%rdx, %r10).  */
	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
	kmovd	%k1, %r9d
	/* Don't use subl since it is the lower 16/32 bits of RDI
	   below.  */
	notl	%r9d
# ifdef USE_AS_WCSCMP
	/* Only last 8 bits are valid.  */
	andl	$0xff, %r9d
# endif

	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
	VPTESTM	%YMM3, %YMM3, %k4
	/* Each bit cleared in K3 represents a mismatch or a null CHAR
	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
	kmovd	%k3, %edi
	/* Must use notl %edi here as lower bits are for CHAR
	   comparisons potentially out of range thus can be 0 without
	   indicating mismatch.  */
	notl	%edi
# ifdef USE_AS_WCSCMP
	/* Don't use subl since it is the upper 8 bits of EDI below.  */
	andl	$0xff, %edi
# endif

# ifdef USE_AS_WCSCMP
	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
	sall	$8, %edi
	/* NB: Divide shift count by 4 since each bit in K1 represent 4
	   bytes.  */
	movl	%ecx, %SHIFT_REG32
	sarl	$2, %SHIFT_REG32

	/* Each bit in EDI represents a null CHAR or a mismatch.  */
	orl	%r9d, %edi
# else
	salq	$32, %rdi

	/* Each bit in RDI represents a null CHAR or a mismatch.  */
	orq	%r9, %rdi
# endif

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrxq	%SHIFT_REG64, %rdi, %rdi
	testq	%rdi, %rdi
	je	L(loop_cross_page_2_vec)
	tzcntq	%rdi, %rcx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
	kmovd	%k1, %r9d
	/* Don't use subl since it is the lower 16/32 bits of RDI
	   below.  */
	notl	%r9d
# ifdef USE_AS_WCSCMP
	/* Only last 8 bits are valid.  */
	andl	$0xff, %r9d
# endif

	VPTESTM	%YMM1, %YMM1, %k4
	/* Each bit cleared in K3 represents a mismatch or a null CHAR
	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
	kmovd	%k3, %edi
	/* Must use notl %edi here as lower bits are for CHAR
	   comparisons potentially out of range thus can be 0 without
	   indicating mismatch.  */
	notl	%edi
# ifdef USE_AS_WCSCMP
	/* Don't use subl since it is the upper 8 bits of EDI below.  */
	andl	$0xff, %edi
# endif

# ifdef USE_AS_WCSCMP
	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
	sall	$8, %edi

	/* Each bit in EDI represents a null CHAR or a mismatch.  */
	orl	%r9d, %edi
# else
	salq	$32, %rdi

	/* Each bit in RDI represents a null CHAR or a mismatch.  */
	orq	%r9, %rdi
# endif

	xorl	%r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl	$(VEC_SIZE * 2), %ecx
	jle	1f
	/* R8 has number of bytes skipped.  */
	movl	%ecx, %r8d
# ifdef USE_AS_WCSCMP
	/* NB: Divide shift count by 4 since each bit in RDI represent 4
	   bytes.  */
	sarl	$2, %ecx
	/* Skip ECX bytes.  */
	shrl	%cl, %edi
# else
	/* Skip ECX bytes.  */
	shrq	%cl, %rdi
# endif
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  */
	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq	%rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi value is 0, it already tested
	   VEC_SIZE*4+%r10 byte starting from %rax.  This label
	   checks whether strncmp maximum offset reached or not.  */
	je	L(string_nbyte_offset_check)
# else
	je	L(back_to_loop)
# endif
	tzcntq	%rdi, %rcx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %ecx
# endif
	addq	%r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq	%r8, %rcx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rcx
	subq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	ret

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq	(VEC_SIZE * 4)(%r10), %r10
	cmpq	%r10, %r11
	jbe	L(zero)
	jmp	L(back_to_loop)
# endif

	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
	cmpl	%ecx, %eax
# else
	subl	%ecx, %eax
# endif
	jne	L(different)
	addl	$SIZE_OF_CHAR, %edx
	cmpl	$(VEC_SIZE * 4), %edx
	je	L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* Check null CHAR.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl	%ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	ret

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS.  */
	movl	$0, %eax
	setl	%al
	negl	%eax
	orl	$1, %eax
	ret
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(char0):
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

	.p2align 4
L(last_vector):
	addq	%rdx, %rdi
	addq	%rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq	%rdx, %r11
# endif
	tzcntl	%ecx, %edx
# ifdef USE_AS_WCSCMP
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %edx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	ret

	/* Comparing on page boundary region requires special treatment:
	   It must be done one vector at a time, starting with the wider
	   ymm vector if possible, if not, with xmm.  If fetching 16 bytes
	   (xmm) still passes the boundary, byte comparison must be done.
	 */
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_1_vector)
L(loop_1_vector):
	VMOVU	(%rdi, %rdx), %YMM0

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xff, %ecx
# else
	incl	%ecx
# endif
	jne	L(last_vector)

	addl	$VEC_SIZE, %edx

	addl	$VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jle	L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl	$(PAGE_SIZE - 16), %eax
	jg	L(cross_page_1_xmm)
	VMOVU	(%rdi, %rdx), %XMM0

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xf, %ecx
# else
	subl	$0xffff, %ecx
# endif
	jne	L(last_vector)

	addl	$16, %edx
# ifndef USE_AS_WCSCMP
	addl	$16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
	cmpl	$(PAGE_SIZE - 8), %eax
	jg	L(cross_page_8bytes)
	vmovq	(%rdi, %rdx), %XMM0
	vmovq	(%rsi, %rdx), %XMM1

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in XMM0 and XMM1.  */
	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
	kmovb	%k1, %ecx
#  ifdef USE_AS_WCSCMP
	subl	$0x3, %ecx
#  else
	subl	$0xff, %ecx
#  endif
	jne	L(last_vector)

	addl	$8, %edx
	addl	$8, %eax
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try 4 byte vector.  */
	cmpl	$(PAGE_SIZE - 4), %eax
	jg	L(cross_page_4bytes)
	vmovd	(%rdi, %rdx), %XMM0
	vmovd	(%rsi, %rdx), %XMM1

	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in XMM0 and XMM1.  */
	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
	kmovd	%k1, %ecx
#  ifdef USE_AS_WCSCMP
	subl	$0x1, %ecx
#  else
	subl	$0xf, %ecx
#  endif
	jne	L(last_vector)

	addl	$4, %edx
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	testl	%eax, %eax
	jne	L(cross_page_loop)
	subl	%ecx, %eax
	ret
END (STRCMP)
#endif