/* wcsrchr with SSE2, without using bsf instructions.
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)
# include <sysdep.h>

/* Unwind-info bookkeeping for a 4-byte push/pop of REG.  The CFI_*
   forms change only the unwind state; PUSH/POP also emit the
   instruction.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* Stack offset of the first argument once ENTRANCE has pushed %edi
   (4 bytes return address + 4 bytes saved %edi).  */
# define PARMS 8
# define ENTRANCE PUSH (%edi);
/* RETURN pops %edi and returns; the trailing CFI_PUSH puts the unwind
   state back to "%edi saved" for the code that follows in the file.  */
# define RETURN POP (%edi); ret; CFI_PUSH (%edi);
# define STR1 PARMS
# define STR2 STR1+4

/* __wcsrchr_sse2: wcsrchr as named in the file header.  Reads two
   4-byte stack arguments, STR1 (start address of the scan) and STR2
   (the 32-bit value searched for); presumably the usual
   (const wchar_t *s, wchar_t c) -- confirm against the C prototype.
   Returns in %eax the address of the last matching element at or
   before the first null element, or 0 if there is none.

   Register roles:
     %edi   scan pointer.  It is advanced by 16 right after each block
	    is loaded, so results are formed as -16/-12/-8/-4(%edi).
	    Saved by ENTRANCE, restored by RETURN.
     %xmm1  search value replicated into all four 32-bit lanes.
     %xmm2  all-zero before each compare; null-lane mask afterwards.
     %eax   pmovmskb mask of lanes equal to the search value.
     %ecx   pmovmskb mask of null lanes (inside L(loop) it is OR-ed
	    with %eax and recomputed at L(match)).
     %edx   match mask of the latest earlier block that had a match,
	    or 0 if there was none.
     %esi   the %edi value belonging to %edx.  Pushed only on the
	    paths that enter L(loop).

   Mask layout: every 32-bit lane contributes four identical bits.
   Lane 0 is the low nibble of %al, lane 1 the high nibble of %al,
   lane 2 the low nibble of %ah, lane 3 the high nibble of %ah.  */

	atom_text_section
ENTRY (__wcsrchr_sse2)

	ENTRANCE
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

	mov	%ecx, %edi
	/* Two interleaves turn [c,0,0,0] into [c,c,c,c].  */
	punpckldq %xmm1, %xmm1
	pxor	%xmm2, %xmm2
	punpckldq %xmm1, %xmm1

/* ECX has OFFSET.  If the low six address bits exceed 48, a 16-byte
   load from the start address would run past the 64-byte aligned block
   that contains it; take the aligned-load path instead.  */
	and	$63, %ecx
	cmp	$48, %ecx
	ja	L(crosscache)

/* Unaligned string: the first 16 bytes can be loaded directly.  */
	movdqu	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
/* Find where NULL is.  */
	pmovmskb %xmm2, %ecx
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match1)

	test	%ecx, %ecx
	jnz	L(return_null)

	/* No match and no null: continue from the next 16-byte aligned
	   address (it may overlap the block just examined).  */
	and	$-16, %edi

	PUSH	(%esi)

	/* No saved match yet.  */
	xor	%edx, %edx
	jmp	L(loop)

	CFI_POP	(%esi)

	.p2align 4
L(unaligned_match1):
	test	%ecx, %ecx
	jnz	L(prolog_find_zero_1)

	PUSH	(%esi)

/* Save current match.  %esi keeps the unaligned end address so that
   the saved mask still maps to the right elements.  */
	mov	%eax, %edx
	mov	%edi, %esi
	and	$-16, %edi
	jmp	L(loop)

	CFI_POP	(%esi)

	.p2align 4
L(crosscache):
/* Handle unaligned string: load the enclosing aligned block and shift
   out the mask bits of the bytes that precede the start address.  The
   null compare goes to %xmm3, which leaves %xmm2 all-zero for
   L(loop).  */
	and	$15, %ecx
	and	$-16, %edi
	pxor	%xmm3, %xmm3
	movdqa	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm3
	pcmpeqd	%xmm1, %xmm0
/* Find where NULL is.  */
	pmovmskb %xmm3, %edx
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes.  */
	shr	%cl, %edx
	shr	%cl, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match)

	test	%edx, %edx
	jnz	L(return_null)

	PUSH	(%esi)

	/* No saved match yet.  */
	xor	%edx, %edx
	jmp	L(loop)

	CFI_POP	(%esi)

	.p2align 4
L(unaligned_match):
	test	%edx, %edx
	jnz	L(prolog_find_zero)

	PUSH	(%esi)

	/* Save the match.  The mask was shifted right by %ecx, so the
	   matching base address is shifted forward by the same amount.  */
	mov	%eax, %edx
	lea	(%edi, %ecx), %esi

/* Loop start on aligned string.  Four 16-byte blocks per iteration;
   each step leaves through L(matches) as soon as the block contains a
   match or a null.  */
	.p2align 4
L(loop):
	movdqa	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm3
	pcmpeqd	%xmm3, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm4
	pcmpeqd	%xmm4, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm4
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm4, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm5
	pcmpeqd	%xmm5, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm5
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm5, %eax
	or	%eax, %ecx
	jz	L(loop)

	.p2align 4
L(matches):
	/* %eax == 0 here means the block has a null but no match.  */
	test	%eax, %eax
	jnz	L(match)
L(return_value):
	/* End of string reached: return the saved match, if any.  */
	test	%edx, %edx
	jz	L(return_null_1)
	mov	%edx, %eax
	mov	%esi, %edi

	POP	(%esi)

	/* Pick the highest matching lane of the saved mask.  */
	test	%ah, %ah
	jnz	L(match_third_or_fourth_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(return_null_1):
	POP	(%esi)

	xor	%eax, %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(match):
	/* %ecx was OR-ed with %eax in the loop; fetch the pure null
	   mask again.  */
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(find_zero)
/* save match info */
	mov	%eax, %edx
	mov	%edi, %esi
	jmp	L(loop)

/* The block holds both a match and a null.  Keep only the match bits
   of lanes up to and including the first null lane; if none remain,
   fall back to the saved match at L(return_value).  */
	.p2align 4
L(find_zero):
	test	%cl, %cl
	jz	L(find_zero_in_third_or_fourth_wchar)
	test	$15, %cl
	jz	L(find_zero_in_second_wchar)
	/* Null in lane 0: only a lane-0 match counts.  */
	and	$1, %eax
	jz	L(return_value)

	POP	(%esi)

	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(find_zero_in_second_wchar):
	/* gas gives << higher precedence than -, so this is
	   (1 << 5) - 1 = 0x1f: lane 0 plus the first bit of lane 1.  */
	and	$1 << 5 - 1, %eax
	jz	L(return_value)

	POP	(%esi)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(find_zero_in_third_or_fourth_wchar):
	test	$15, %ch
	jz	L(find_zero_in_fourth_wchar)
	/* (1 << 9) - 1 = 0x1ff: lanes 0-1 plus the first bit of
	   lane 2.  */
	and	$1 << 9 - 1, %eax
	jz	L(return_value)

	POP	(%esi)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(find_zero_in_fourth_wchar):
	/* Null in the last lane: every match in this block is valid,
	   and %eax is known to be non-zero here.  */

	POP	(%esi)

	test	%ah, %ah
	jnz	L(match_third_or_fourth_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

/* No CFI_PUSH (%esi) here: every label from this point to END is
   entered with only %edi on the stack (either after POP (%esi) or
   before any PUSH (%esi)), so the unwind state left by RETURN is
   already the correct one.  */

	.p2align 4
L(match_second_wchar):
	lea	-12(%edi), %eax
	RETURN

	.p2align 4
L(match_third_or_fourth_wchar):
	test	$15 << 4, %ah
	jnz	L(match_fourth_wchar)
	lea	-8(%edi), %eax
	RETURN

	.p2align 4
L(match_third_wchar):
	lea	-8(%edi), %eax
	RETURN

	.p2align 4
L(match_fourth_wchar):
	lea	-4(%edi), %eax
	RETURN

	.p2align 4
L(return_null):
	xor	%eax, %eax
	RETURN

/* Same lane selection as L(find_zero), for the very first block: there
   is no saved match yet, so "no valid match" returns 0 directly and
   %esi has not been pushed.  */
	.p2align 4
L(prolog_find_zero):
	/* Entered from L(unaligned_match): undo the mask shift in the
	   base address and move the null mask into %ecx.  */
	add	%ecx, %edi
	mov	%edx, %ecx
L(prolog_find_zero_1):
	test	%cl, %cl
	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
	test	$15, %cl
	jz	L(prolog_find_zero_in_second_wchar)
	and	$1, %eax
	jz	L(return_null)

	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(prolog_find_zero_in_second_wchar):
	/* (1 << 5) - 1 = 0x1f, see L(find_zero_in_second_wchar).  */
	and	$1 << 5 - 1, %eax
	jz	L(return_null)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(prolog_find_zero_in_third_or_fourth_wchar):
	test	$15, %ch
	jz	L(prolog_find_zero_in_fourth_wchar)
	/* (1 << 9) - 1 = 0x1ff.  */
	and	$1 << 9 - 1, %eax
	jz	L(return_null)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(prolog_find_zero_in_fourth_wchar):
	test	%ah, %ah
	jnz	L(match_third_or_fourth_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

END (__wcsrchr_sse2)
#endif