/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC one
      vector at a time to check for early mismatches.  Only do this
      if it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.

When possible the implementation tries to optimize for frontend in the
following ways:
Throughput:
    1. All code sections that fit are able to run optimally out of the
       LSD.
    2. All code sections that fit are able to run optimally out of the
       DSB.
    3. Basic blocks are contained in the minimum number of fetch
       blocks necessary.

Latency:
    1. Logically connected basic blocks are put in the same
       cache line.
    2. Logically connected basic blocks that do not fit in the same
       cache line are put in adjacent lines.  This can benefit from
       L2 spatial prefetching and L1 next-line prefetching.  */

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_evex_movbe
# endif

# define VMOVU		vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK	vmovdqu32
#  define CHAR_SIZE	4
#  define VPCMP		vpcmpd
#  define VPTEST	vptestmd
# else
#  define VMOVU_MASK	vmovdqu8
#  define CHAR_SIZE	1
#  define VPCMP		vpcmpub
#  define VPTEST	vptestmb
# endif


# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# define XMM0		xmm16
# define XMM1		xmm17
# define XMM2		xmm18
# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */
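/* Note: each return path below converts the index of the first
   mismatching CHAR (recovered from the compare mask with tzcnt/bsf)
   into memcmp's int result.  For memcmp this is simply the
   difference of the two zero-extended bytes.  For wmemcmp the SIGNED
   comparison is materialized branchlessly; as an illustration only
   (not part of the build), the setg + lea pair computes roughly:

	gt  = (s1_char > s2_char)	via setg after the signed cmpl
	ret = 2 * gt - 1		via leal -1(%rdx, %rdx), %eax

   i.e. 1 when the CHAR from s1 is greater and -1 otherwise.  The
   equal case cannot reach these paths since a mismatch was already
   detected.  */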
	.section .text.evex,"ax",@progbits
/* Cache align memcmp entry.  This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$CHAR_PER_VEC, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it is the hottest.  */
	ja	L(more_1x_vec)

	/* Create mask of the CHARs we want to compare.  This allows us
	   to avoid having to include page cross logic.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2

	/* Safe to load full ymm with mask.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4, (%rdi), %YMM2, %k1{%k2}
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	ret

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(more_1x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4, (%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
	testl	%eax, %eax
	jnz	L(return_vec_0)

	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

	/* Wait to load from s1 until the addresses have been adjusted,
	   due to unlamination of microfusion with complex address
	   modes.  */

	/* The vpxor result will be all 0s if s1 and s2 are equal.
	   Otherwise it will have some 1s.  */
	vpxorq	(%rdi), %YMM1, %YMM1
	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
	   oring with YMM1.  Result is stored in YMM4.  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4

	/* Test YMM4 against itself.  Store any CHAR mismatches in k1.
	 */
	VPTEST	%YMM4, %YMM4, %k1
	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret
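/* Note on the vpternlogd immediates used above and in
   L(loop_4x_vec): the imm8 is a truth table indexed by
   (dest_bit << 2) | (src1_bit << 1) | src2_bit, so for the operand
   orders used here:

	0xde -> (dest ^ src2) | src1	i.e. (YMM4 ^ s1 data) | YMM1
	0xfe -> dest | src1 | src2	i.e. YMM4 | YMM3 | YMM2

   This folds the four per-vector differences into YMM4 so that a
   single VPTEST detects a mismatch in any of them.  */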
	.p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
	movq	%rdx, %rdi
L(8x_return_vec_0_1_2_3):
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	VPTEST	%YMM1, %YMM1, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)

	VPTEST	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	VPTEST	%YMM3, %YMM3, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)
L(return_vec_3):
	/* bsf saves 1 byte over tzcnt.  This keeps L(return_vec_3) in
	   one fetch block and the entire L(*return_vec_0_1_2_3) block
	   in one cache line.  */
	bsfl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(return_vec_1):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(return_vec_2):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret
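/* Note on the 4x loop that follows: %rsi is rewritten to hold
   s2 - s1, so only %rdi has to be advanced and (%rsi, %rdi) still
   addresses the corresponding data in s2.  The pointer is advanced
   with subq $-(VEC_SIZE * 4) rather than addq $(VEC_SIZE * 4) since
   -128 fits in a sign-extended imm8 while +128 does not, giving a
   shorter encoding.  The remaining [1, 4 * VEC_SIZE] bytes after the
   loop are handled with vectors that end exactly at the end of the
   buffers (addressed relative to %rdx), possibly overlapping data
   that has already been compared.  */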
	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi

	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1
	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_end_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4,, 10
L(8x_last_2x_vec):
	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_2)
	/* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_3)
	ret

	/* Not ideally aligned (at offset +9 bytes in the fetch block),
	   but not aligning keeps it in the same cache line as
	   L(8x_last_1x/2x_vec), so it is likely worth it.  It also
	   saves code size.  */
	.p2align 4,, 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0_end)
	ret


	/* Don't align.  It takes 2 fetch blocks either way and aligning
	   would cause code to spill into another cache line.  */
L(return_vec_1_end):
	/* Use bsf to save code size.  This is necessary to have
	   L(one_or_less) fit in the aligning bytes between.  */
	bsfl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	/* Don't align.  It takes 2 fetch blocks either way and aligning
	   would cause code to spill into another cache line.  */
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret
	/* 1 byte until next cache line.  */

END (MEMCMP)
#endif