/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RSI_LP, %RSI_LP
#  else
	test	%RSI_LP, %RSI_LP
#  endif
	jz	L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If no null was found continue to aligned_more.  Otherwise
	   return the bit position of the first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

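	/* first_vec_x{1..4}: the first null character was found in the
	   Nth vector checked from cross_page_continue.  The blocks are
	   identical except for the VEC_SIZE multiple added to convert
	   the tzcnt result into a length from the original start.  */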
	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1.  + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward.  */
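	/* rdi is now aligned to VEC_SIZE * 4 - 1, so every iteration can
	   use aligned loads for 4 full vectors starting at rdi + 1 and
	   only needs to test the combined minimum for a null character.  */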
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively there is no issue
	   with overlap.  */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4

	VPMINU	%ymm2, %ymm4, %ymm5
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb	%ymm5, %ecx

	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)

	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has combined result from all 4 VEC.  It will only be used
	   if the first 3 other VEC all did not contain a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	vpmovmskb	%ymm1, %eax
	/* Check if remaining length > VEC_SIZE * 2.  This works even if
	   esi is off by VEC_SIZE * 4.  */
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by VEC_SIZE * 4 in either direction
	   depending on where this was called from.  This fixes that.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

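	/* last_vec_return_x{0,1}: a null was found in the first or second
	   vector of the final loop iteration.  rdi has already been
	   advanced past all four vectors and rebased to the string start,
	   so only a constant is subtracted before adding the tzcnt
	   result.  */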
	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max):
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
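	/* The first load could touch the next page, so instead load
	   VEC_SIZE bytes ending at the last byte of the aligned block
	   containing the start and shift the match mask right by the
	   misalignment so bit 0 corresponds to the first character.  */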
	/* Align data to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes.  sarxl only uses bits [4:0] of COUNT
	   so no need to manually mod rdx.  */
	sarxl	%edx, %eax, %eax
# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %ecx
#  endif
	/* Check length.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide length by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
#  ifdef USE_AS_WCSLEN
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif