/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ	vpcmpeqd
#  define VPBROADCAST	vpbroadcastd
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPBROADCAST	vpbroadcastb
#  define CHAR_SIZE	1
# endif

# ifdef USE_AS_RAWMEMCHR
#  define ERAW_PTR_REG	ecx
#  define RRAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define ERAW_PTR_REG	edi
#  define RRAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	and	%RDX_LP, %RDX_LP
#  else
	test	%RDX_LP, %RDX_LP
#  endif
	jz	L(null)
# endif
	/* Broadcast CHAR to ymm0.  */
	vmovd	%esi, %xmm0
	VPBROADCAST %xmm0, %ymm0
	/* Check if we may cross a page boundary with one vector load.  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length <= CHAR_PER_VEC handle it specially.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
	.p2align 5
L(first_vec_x0):
	/* Check if the first match lies within the length.  */
	tzcntl	%eax, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	xorl	%ecx, %ecx
	cmpl	%eax, %edx
	leaq	(%rdi, %rax), %rax
	cmovle	%rcx, %rax
	VZEROUPPER_RETURN

L(null):
	xorl	%eax, %eax
	ret
# endif
	.p2align 4
L(cross_page_boundary):
	/* Save the pointer before aligning, as its original value is
	   needed to compute the return address if a byte is found, or
	   to adjust the length if it is not (memchr only).  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr
	   and rdi for rawmemchr.  */
	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
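	/* NB: The orq above rounds the pointer up to the last byte of
	   its VEC_SIZE block, so the backward load reads the aligned
	   VEC_SIZE bytes containing the start of the buffer without
	   touching the next page.  The resulting mask may therefore
	   contain matches before the original pointer; those are
	   shifted out below, using the low bits of the unaligned
	   pointer as the shift count.  */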
# ifndef USE_AS_RAWMEMCHR
	/* Calculate length until end of page (length checked for a
	   match).  */
	leaq	1(%ALGN_PTR_REG), %rsi
	subq	%RRAW_PTR_REG, %rsi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %esi
#  endif
# endif
	/* Remove the leading bytes.  */
	sarxl	%ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	addq	%RRAW_PTR_REG, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
L(cross_page_continue):
	/* Align data to VEC_SIZE - 1.  */
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	orq	$(VEC_SIZE - 1), %rdi
	/* esi is for adjusting length to see if near the end.  */
	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
# else
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  If near end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

# ifndef USE_AS_RAWMEMCHR
	/* Check if within the last CHAR_PER_VEC * 4 characters of the
	   length.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
	   length.  */
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	addq	%rcx, %rdx
# else
	/* Align data to VEC_SIZE * 4 - 1 for the loop.  */
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif

	/* Compare 4 * VEC at a time forward.  */
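	/* Each iteration compares all four vectors and OR's the
	   results together, so a single vpmovmskb/test per iteration
	   detects a match anywhere in the VEC_SIZE * 4 bytes; the
	   individual compare results in ymm1-ymm4 are only inspected
	   once the combined mask is nonzero.  */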
	.p2align 4
L(loop_4x_vec):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)
# else
	testl	%ecx, %ecx
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into the case where fewer than 4 vectors of the
	   length remain.  */
	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	.p2align 4
L(last_4x_vec_or_less):
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	/* Check if first VEC contained a match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > VEC_SIZE * 2.  */
	addl	$(VEC_SIZE * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length <= VEC_SIZE.  */
	addl	$VEC_SIZE, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match against the remaining
	   length.  */
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl	%eax, %eax
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
L(zero_end):
	VZEROUPPER_RETURN

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr falls through into this if a match was found in the
	   loop.  */

	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_return)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	vpmovmskb %ymm3, %eax
	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 2 - 1), %rdi
# else
	subq	$-(VEC_SIZE * 2 + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Adjust length.  */
	subl	$-(VEC_SIZE * 4), %edx
	/* Check if the match is within the remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
	.p2align 4
L(set_zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4 - 1), %rdi
# else
	incq	%rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 3 - 1), %rdi
# else
	subq	$-(VEC_SIZE + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

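	/* Here edx holds the remaining byte count biased by
	   -(VEC_SIZE * 4), so the additions below test the remaining
	   length against each vector boundary without a separate
	   compare.  */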
	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl	$(VEC_SIZE * 2), %edx
	jle	L(last_2x_vec)
	.p2align 4
L(last_4x_vec):
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Create a mask of the possible match positions within the
	   remaining length.  rdx = remaining bytes beyond the first
	   two vectors, so the 64-bit mask covers VEC3 in bits 0-31 and
	   VEC4 in bits 32-63.  */
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx

	/* Test matches in data against the length mask.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= VEC_SIZE * 3 (note this is after the
	   remaining length was found to be > VEC_SIZE * 2).  */
	subl	$VEC_SIZE, %edx
	jbe	L(zero_end2)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Shift the remaining-length mask down for the last VEC.  */
	shrq	$32, %rcx
	andl	%ecx, %eax
	jz	L(zero_end2)
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
L(zero_end2):
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subq	$-(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# endif

END (MEMCHR)
#endif