/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * VEC_SIZE one VEC at a
      time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE	4
#  define VPCMPEQ	vpcmpeqd
# else
#  define CHAR_SIZE	1
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
	shl	$2, %RDX_LP
# elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
	incl	%eax
	jnz	L(return_vec_0)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	/* If all bytes in the VEC were equal eax will be all 1s so incl
	   will overflow and set the zero flag.  */
	incl	%eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

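	/* A note on the equality-mask idiom used throughout: VPCMPEQ
	   produces 0xff for every equal byte, vpmovmskb packs those into
	   a 32-bit mask, and incl turns an all-ones mask (everything
	   equal) into zero, setting ZF.  On a mismatch the increment
	   carries through the trailing 1s into the first 0 bit, so the
	   tzcnt in L(return_vec_*) lands on the first differing byte.
	   Rough C sketch of the same idea for the byte (memcmp) flavor,
	   added here as commentary only; a, b and first_diff are
	   placeholders and <immintrin.h>/AVX2 are assumed:

	     unsigned eq = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (a, b));
	     if (eq != 0xffffffff)
	       first_diff = __builtin_ctz (eq + 1);
	   */
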
	/* Check third and fourth VEC no matter what.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(return_vec_2)
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpmovmskb %ymm4, %ecx
	incl	%ecx
	jnz	L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi

	/* Wait to load from s1 until the addresses have been adjusted,
	   to avoid unlamination of micro-fusion with complex address
	   modes.  */
	VPCMPEQ	(%rdi), %ymm1, %ymm1
	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	/* Reduce VEC0 - VEC4.  */
	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl	%ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_1):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_2):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	/* NB: p2align 5 here to ensure the 4x loop is 32 byte aligned.  */
	.p2align 5
L(8x_return_vec_0_1_2_3):
	/* Returning from L(more_8x_vec) requires restoring rsi.  */
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_vec_0)

	vpmovmskb %ymm2, %eax
	incl	%eax
	jnz	L(return_vec_1)

	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(return_vec_2)
L(return_vec_3):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
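	/* Added commentary, not part of the original implementation: a
	   rough C sketch of the single-pointer loop set up below.
	   align_down (), compare_4_vec () and first_mismatch () are
	   hypothetical helpers standing in for the andq, the 4x VPCMPEQ
	   block (nonzero when all bytes are equal) and the
	   L(8x_return_vec_0_1_2_3) path.

	     ptrdiff_t d = (const char *) s2 - (const char *) s1;
	     const char *p = align_down (s1, VEC_SIZE) + 4 * VEC_SIZE;
	     const char *end = (const char *) s1 + n - 4 * VEC_SIZE;
	     do
	       {
	         if (!compare_4_vec (p, p + d))
	           return first_mismatch (p, p + d);
	         p += 4 * VEC_SIZE;
	       }
	     while (p < end);

	   followed by the 1x/2x/4x final-VEC checks below.  */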
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
	.p2align 4
L(loop_4x_vec):
	/* rsi has s2 - s1 so get the correct address by adding s1 (in
	   rdi).  */
	vmovdqu	(%rsi, %rdi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl	%ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check if the s1 pointer is at the end.  */
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	/* Check last 4 VEC.  */
	vmovdqu	(%rsi, %rdx), %ymm1
	VPCMPEQ	(%rdx), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2

	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	/* Restore s1 pointer to rdi.  */
	movq	%rdx, %rdi
	incl	%ecx
	jnz	L(8x_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4
L(8x_last_2x_vec):
	/* Check second to last VEC.  rdx stores the end pointer of s1
	   and ymm3 has already been loaded with the second to last VEC
	   from s2.  */
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(8x_return_vec_2)
	/* Check last VEC.  */
	.p2align 4
L(8x_last_1x_vec):
	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
	jnz	L(8x_return_vec_3)
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	/* Check second to last VEC.  */
	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_vec_1_end)
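	/* Added commentary, not part of the original implementation:
	   rather than branching on the exact remainder, L(last_2x_vec)
	   above and L(last_1x_vec) below re-check the final vectors
	   ending exactly at the buffer end, so bytes that were already
	   compared may be compared again.  Rough C sketch, with
	   compare_vec () as a hypothetical VEC_SIZE compare returning
	   nonzero when all bytes are equal:

	     if (!compare_vec (s1 + n - VEC_SIZE, s2 + n - VEC_SIZE))
	       return first_mismatch (s1 + n - VEC_SIZE,
	                              s2 + n - VEC_SIZE);
	     return 0;
	   */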
	/* Check last VEC.  */
L(last_1x_vec):
	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_vec_0_end)
	VZEROUPPER_RETURN

	.p2align 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	tzcntl	%eax, %eax
	addq	%rdx, %rax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_1_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(less_vec):
	/* Check if one or less CHAR.  This is necessary for size = 0
	   but is also faster for size = CHAR_SIZE.  */
	cmpl	$CHAR_SIZE, %edx
	jbe	L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a
	   page cross.  This can have false positives but is by far the
	   fastest method.  */
	movl	%edi, %eax
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(page_cross_less_vec)

	/* No page cross possible.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl	%eax
	/* Result will be zero if s1 and s2 match.  Otherwise the first
	   set bit will be the first mismatch.  */
	bzhil	%edx, %eax, %edx
	jnz	L(return_vec_0)
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(page_cross_less_vec):
	/* If USE_AS_WMEMCMP size can only be 0, 4, 8, 12, 16, 20, 24 or
	   28 bytes.  */
	cmpl	$16, %edx
	jae	L(between_16_31)
# ifndef USE_AS_WMEMCMP
	cmpl	$8, %edx
	jae	L(between_8_15)
	cmpl	$4, %edx
	jae	L(between_4_7)

	/* Load as big endian to avoid branches.  */
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	shll	$8, %eax
	shll	$8, %ecx
	bswap	%eax
	bswap	%ecx
	movzbl	-1(%rdi, %rdx), %edi
	movzbl	-1(%rsi, %rdx), %esi
	orl	%edi, %eax
	orl	%esi, %ecx
	/* Subtraction is okay because the upper 8 bits are zero.  */
	subl	%ecx, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(one_or_less):
	jb	L(zero)
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(between_8_15):
# endif
	/* If USE_AS_WMEMCMP fall through into the 8-15 byte case.  */
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)
	/* Use overlapping loads to avoid branches.  */
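	/* Added commentary, not part of the original implementation:
	   rough C sketch of the overlapping trick for the 8-15 byte case
	   (the 16-31 byte case in L(between_16_31) does the same with
	   16-byte loads).  cmp8 () is a hypothetical 8-byte compare
	   returning nonzero when equal; the second compare re-reads the
	   last 8 bytes ending at the buffer end, which may overlap the
	   first 8.

	     if (!cmp8 (s1, s2))
	       return first_mismatch (s1, s2);
	     if (!cmp8 (s1 + n - 8, s2 + n - 8))
	       return first_mismatch (s1 + n - 8, s2 + n - 8);
	     return 0;
	   */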
	leaq	-8(%rdi, %rdx), %rdi
	leaq	-8(%rsi, %rdx), %rsi
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	vmovdqu	(%rsi), %xmm2
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)

	/* Use overlapping loads to avoid branches.  */

	vmovdqu	-16(%rsi, %rdx), %xmm2
	leaq	-16(%rdi, %rdx), %rdi
	leaq	-16(%rsi, %rdx), %rsi
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)
	/* No ymm register was touched.  */
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(one_or_less):
	jb	L(zero)
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(zero)
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	/* No ymm register was touched.  */
	ret
# else

	.p2align 4
L(between_4_7):
	/* Load as big endian with overlapping movbe to avoid branches.  */
	movbe	(%rdi), %eax
	movbe	(%rsi), %ecx
	shlq	$32, %rax
	shlq	$32, %rcx
	movbe	-4(%rdi, %rdx), %edi
	movbe	-4(%rsi, %rdx), %esi
	orq	%rdi, %rax
	orq	%rsi, %rcx
	subq	%rcx, %rax
	jz	L(zero_4_7)
	sbbl	%eax, %eax
	orl	$1, %eax
L(zero_4_7):
	/* No ymm register was touched.  */
	ret
# endif

END (MEMCMP)
#endif