/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. For sizes from 2 to 7 bytes on a page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compares when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * VEC_SIZE bytes one
      VEC at a time to check for early mismatches. Only do this if it
      is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */

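/* Illustrative C sketch of the basic per-VEC comparison pattern used
   throughout this file, written with the equivalent AVX2 intrinsics
   from <immintrin.h> (not part of the build; s1 and s2 stand for the
   memcmp arguments):

     __m256i a = _mm256_loadu_si256 ((const __m256i *) s1);
     __m256i b = _mm256_loadu_si256 ((const __m256i *) s2);
     unsigned int mask
       = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (a, b));
     if (mask != 0xffffffff)
       {
	 unsigned int i = __builtin_ctz (mask + 1);
	 return (int) ((const unsigned char *) s1)[i]
		- (int) ((const unsigned char *) s2)[i];
       }

   In the assembly `incl' computes mask + 1: an all-equal mask of
   0xffffffff wraps to zero and sets ZF, while on a mismatch the
   lowest set bit of mask + 1 is exactly the first mismatching byte,
   which tzcnt extracts.  */
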
# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE	4
#  define VPCMPEQ	vpcmpeqd
# else
#  define CHAR_SIZE	1
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE	4096

/* Warning!
           wmemcmp has to use SIGNED comparison for elements.
           memcmp has to use UNSIGNED comparison for elements.
*/
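/* Illustrative sketch of why the final element comparison differs
   (not part of the build; a and b stand for the first differing
   elements):

     memcmp:  return (int) (unsigned char) a - (int) (unsigned char) b;
     wmemcmp: return ((int) a > (int) b) ? 1 : -1;

   On these targets wchar_t is a signed 32-bit int, so wmemcmp must
   order elements as signed values, while memcmp orders bytes as
   unsigned values.  VPCMPEQ therefore only tests equality; the
   ordering is done afterwards with a scalar signed compare (setg)
   for wmemcmp and an unsigned byte subtraction for memcmp.  */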

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
	shl	$2, %RDX_LP
# elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
	incl	%eax
	jnz	L(return_vec_0)
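	/* Note: vpmovmskb yields one bit per byte, so a fully matching
	   32-byte VEC gives a mask of 0xffffffff, which incl wraps to
	   zero, setting ZF.  On a mismatch the incremented mask's
	   lowest set bit lies at the first differing byte, so the
	   tzcnt in the return paths recovers the mismatch offset
	   directly.  */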

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	/* If the VEC was entirely equal, eax will be all 1s so incl
	   will overflow and set the zero flag.  */
	incl	%eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(return_vec_2)
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpmovmskb %ymm4, %ecx
	incl	%ecx
	jnz	L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remaining sizes from 4 * VEC + 1 to 8 * VEC without
	   any branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi

	/* Wait to load from s1 until the addresses have been adjusted,
	   to avoid un-lamination of micro-fusion with a complex
	   addressing mode.  */
	VPCMPEQ	(%rdi), %ymm1, %ymm1
	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	/* Reduce VEC 1 - VEC 4 with AND.  */
	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl	%ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN
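	/* Illustrative sketch of the block above (not part of the
	   build): for 4 * VEC_SIZE < size <= 8 * VEC_SIZE the first
	   four VECs were already compared, so the rest is covered by
	   re-anchoring both pointers at the end and comparing the last
	   four VECs, which may overlap the first four:

	     s1 += size - 4 * VEC_SIZE;
	     s2 += size - 4 * VEC_SIZE;
	     compare the four VECs at offsets 0 .. 3 * VEC_SIZE;

	   The overlapping re-compares waste a little work but avoid
	   branching on the exact size.  */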

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

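	/* Illustrative note on the wmemcmp path above (not part of the
	   build): setg sets dl to (a > b) as 0 or 1, and
	   `leal -1(%rdx, %rdx)' then computes 2 * (a > b) - 1, i.e. 1
	   or -1, without a branch.  The memcmp path instead returns
	   the zero-extended byte difference, which is already a valid
	   result.  */
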
	.p2align 4
L(return_vec_1):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_2):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	/* NB: p2align 5 here to ensure the 4x loop is 32 byte aligned.  */
	.p2align 5
L(8x_return_vec_0_1_2_3):
	/* Returning from L(more_8x_vec) requires restoring rsi.  */
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_vec_0)

	vpmovmskb %ymm2, %eax
	incl	%eax
	jnz	L(return_vec_1)

	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(return_vec_2)
L(return_vec_3):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(more_8x_vec):
	/* Set rdx to the end of s1 minus 4 * VEC_SIZE; this is the
	   loop bound.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
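	/* Illustrative sketch of the loop's pointer scheme (not part of
	   the build; diff is a hypothetical name for the value kept in
	   rsi):

	     ptrdiff_t diff = s2 - s1;             (held in rsi)
	     s1 = align_down (s1, VEC_SIZE) + 4 * VEC_SIZE;
	     do
	       {
		 compare 4 VECs at s1 and at s1 + diff (== s2);
		 s1 += 4 * VEC_SIZE;
	       }
	     while (s1 < s1_end - 4 * VEC_SIZE);

	   Keeping s2 - s1 instead of s2 means the loop updates a single
	   pointer.  Also, `subq $-(VEC_SIZE * 4)' is preferred over
	   `addq $(VEC_SIZE * 4)' because -128 fits in a sign-extended
	   imm8 while +128 does not, giving a shorter encoding.  */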
	.p2align 4
L(loop_4x_vec):
	/* rsi has s2 - s1, so the correct s2 address is rsi plus s1
	   (in rdi).  */
	vmovdqu	(%rsi, %rdi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl	%ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check if the s1 pointer has reached the loop bound.  */
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)
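	/* Note (illustrative arithmetic): with rdi == 4 * VEC_SIZE -
	   remaining, rdi >= 3 * VEC_SIZE means at most VEC_SIZE bytes
	   remain (only the last VEC needs checking), and rdi >=
	   2 * VEC_SIZE means at most 2 * VEC_SIZE bytes remain.
	   Otherwise up to 4 * VEC_SIZE bytes remain and all four
	   trailing VECs are rechecked below.  */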

	/* Check the last 4 VEC.  */
	vmovdqu	(%rsi, %rdx), %ymm1
	VPCMPEQ	(%rdx), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2

	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	/* Restore s1 pointer to rdi.  */
	movq	%rdx, %rdi
	incl	%ecx
	jnz	L(8x_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4
L(8x_last_2x_vec):
	/* Check the second to last VEC.  rdx holds the end of s1 minus
	   4 * VEC_SIZE, and ymm3 has already been loaded with the
	   second to last VEC from s2.  */
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(8x_return_vec_2)
	/* Check the last VEC.  */
	.p2align 4
L(8x_last_1x_vec):
	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
	jnz	L(8x_return_vec_3)
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	/* Check second to last VEC.  */
	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_vec_1_end)
	/* Check last VEC.  */
L(last_1x_vec):
	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_vec_0_end)
	VZEROUPPER_RETURN

	.p2align 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	tzcntl	%eax, %eax
	addq	%rdx, %rax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_1_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(less_vec):
	/* Check if one CHAR or less.  This is necessary for size == 0,
	   but is also faster for size == CHAR_SIZE.  */
	cmpl	$CHAR_SIZE, %edx
	jbe	L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a
	   page cross.  This can have false positives but is by far the
	   fastest method.  */
	movl	%edi, %eax
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(page_cross_less_vec)
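	/* Illustrative C equivalent of the check above (not part of
	   the build):

	     if ((((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1))
		 > PAGE_SIZE - VEC_SIZE)
	       goto page_cross_less_vec;

	   OR-ing the two page offsets can only overestimate either
	   offset, so a real page cross is never missed; it may flag a
	   cross that neither pointer has (a false positive), which
	   only costs taking the slower path.  */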

	/* No page cross possible.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl	%eax
	/* After incl, eax is zero if the whole VEC matched; otherwise
	   its first set bit is the first mismatch.  bzhi clears the
	   bits at position size and above, so mismatches beyond the
	   requested length are ignored, and its zero flag drives the
	   branch.  */
	bzhil	%edx, %eax, %edx
	jnz	L(return_vec_0)
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(page_cross_less_vec):
	/* If USE_AS_WMEMCMP, the size can only be 0, 4, 8, 12, 16, 20,
	   24 or 28 bytes.  */
	cmpl	$16, %edx
	jae	L(between_16_31)
# ifndef USE_AS_WMEMCMP
	cmpl	$8, %edx
	jae	L(between_8_15)
	cmpl	$4, %edx
	jae	L(between_4_7)

	/* Size is 2 or 3.  Load as big endian to avoid branches.  */
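	/* Note: packing the leading bytes into the high bits and the
	   trailing byte into the low bits builds a big-endian value,
	   so an ordinary integer comparison of the packed values
	   orders the buffers exactly as a byte-wise memcmp would.  For
	   example, the two bytes {0x01, 0xff} pack to 0x0001ff00
	   before the final OR re-adds the last byte in the low bits.  */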
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	shll	$8, %eax
	shll	$8, %ecx
	bswap	%eax
	bswap	%ecx
	movzbl	-1(%rdi, %rdx), %edi
	movzbl	-1(%rsi, %rdx), %esi
	orl	%edi, %eax
	orl	%esi, %ecx
	/* Subtraction is okay because the upper 8 bits are zero.  */
	subl	%ecx, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(one_or_less):
	jb	L(zero)
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(between_8_15):
# endif
	/* If USE_AS_WMEMCMP, fall through into the 8-15 byte case.  */
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)
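	/* Note: for xmm compares vpmovmskb only fills the low 16 bits,
	   so an all-equal mask is 0xffff and subtracting 0xffff gives
	   zero.  On a mismatch the result is nonzero and its lowest
	   set bit is still the first mismatching byte (mask - 0xffff
	   == -(0xffff - mask), and negation preserves the lowest set
	   bit), so L(return_vec_0)'s tzcnt works unchanged.  */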
	/* Use overlapping loads to avoid branches.  */
	leaq	-8(%rdi, %rdx), %rdi
	leaq	-8(%rsi, %rdx), %rsi
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	vmovdqu	(%rsi), %xmm2
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)

	/* Use overlapping loads to avoid branches.  */

	vmovdqu	-16(%rsi, %rdx), %xmm2
	leaq	-16(%rdi, %rdx), %rdi
	leaq	-16(%rsi, %rdx), %rsi
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(return_vec_0)
	/* No ymm register was touched.  */
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(one_or_less):
	jb	L(zero)
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(zero)
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	/* No ymm register was touched.  */
	ret
# else

	.p2align 4
L(between_4_7):
	/* Load as big endian with overlapping movbe to avoid branches.
	 */
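	/* Illustrative C sketch of the trick below (not part of the
	   build; load32_be is a hypothetical helper standing in for
	   movbe): the first and last four bytes, which overlap when
	   size < 8, are each loaded big endian and packed into one
	   64-bit big-endian key per buffer, so a single unsigned
	   64-bit comparison yields the memcmp ordering:

	     uint64_t a = ((uint64_t) load32_be (s1) << 32)
			  | load32_be (s1 + size - 4);
	     uint64_t b = ((uint64_t) load32_be (s2) << 32)
			  | load32_be (s2 + size - 4);
	     if (a == b)
	       return 0;
	     return a > b ? 1 : -1;

	   The sbb/or pair below materializes the 1 or -1 from the
	   carry flag of the 64-bit subtraction without a branch.  */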
	movbe	(%rdi), %eax
	movbe	(%rsi), %ecx
	shlq	$32, %rax
	shlq	$32, %rcx
	movbe	-4(%rdi, %rdx), %edi
	movbe	-4(%rsi, %rdx), %esi
	orq	%rdi, %rax
	orq	%rsi, %rcx
	subq	%rcx, %rax
	jz	L(zero_4_7)
	sbbl	%eax, %eax
	orl	$1, %eax
L(zero_4_7):
	/* No ymm register was touched.  */
	ret
# endif

END (MEMCMP)
#endif