/* memcmp with SSE4.1, wmemcmp with SSE4.1
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_sse4_1
# endif

#ifdef USE_AS_WMEMCMP
# define CMPEQ	pcmpeqd
# define CHAR_SIZE	4
#else
# define CMPEQ	pcmpeqb
# define CHAR_SIZE	1
#endif


/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */
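
/* A minimal C sketch of the required semantics, for illustration only.
   It is not assembled or built, the names ref_memcmp/ref_wmemcmp are
   hypothetical, and it would need <stddef.h> and <wchar.h>.  Only the
   sign of the result matters; the assembly below may return any value
   with the correct sign (for memcmp, often the byte difference).

       int ref_memcmp (const void *s1, const void *s2, size_t n)
       {
         const unsigned char *a = s1, *b = s2;
         for (size_t i = 0; i < n; i++)
           if (a[i] != b[i])
             return a[i] < b[i] ? -1 : 1;   // unsigned byte compare
         return 0;
       }

       int ref_wmemcmp (const wchar_t *a, const wchar_t *b, size_t n)
       {
         for (size_t i = 0; i < n; i++)
           if (a[i] != b[i])
             return a[i] < b[i] ? -1 : 1;   // signed element compare
         return 0;
       }
*/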

	.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
	shl	$2, %RDX_LP
# elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
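	/* Lengths of at most 79 bytes (after the wmemcmp length has been
	   scaled to bytes) are handled with branches over loads from the
	   start and the end of the buffers, which may overlap; anything
	   longer goes to L(79bytesormore).  */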
	cmp	$79, %RDX_LP
	ja	L(79bytesormore)

	cmp	$CHAR_SIZE, %RDX_LP
	jbe	L(firstbyte)

	/* N in (CHAR_SIZE, 79) bytes.  */
	cmpl	$32, %edx
	ja	L(more_32_bytes)

	cmpl	$16, %edx
	jae	L(16_to_32_bytes)

# ifndef USE_AS_WMEMCMP
	cmpl	$8, %edx
	jae	L(8_to_16_bytes)

	cmpl	$4, %edx
	jb	L(2_to_3_bytes)

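	/* 4 to 7 bytes: load the first and the last 4 bytes of each
	   buffer (the two loads overlap for these lengths), byte-swap
	   them so that memory order becomes numeric order, and combine
	   them into one 64-bit value per buffer; a single subtraction
	   then finds the first difference.  */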
	movl	(%rdi), %eax
	movl	(%rsi), %ecx

	bswap	%eax
	bswap	%ecx

	shlq	$32, %rax
	shlq	$32, %rcx

	movl	-4(%rdi, %rdx), %edi
	movl	-4(%rsi, %rdx), %esi

	bswap	%edi
	bswap	%esi

	orq	%rdi, %rax
	orq	%rsi, %rcx
	subq	%rcx, %rax
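	/* Convert the 64-bit difference to an int return value: if the
	   values differ, cmovne loads the (small, positive) length into
	   %eax, and the sbb/or pair forces the result to -1 when the
	   subtraction borrowed, i.e. when the first buffer compares below
	   the second.  */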
	cmovne	%edx, %eax
	sbbl	%ecx, %ecx
	orl	%ecx, %eax
	ret

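	/* 2 or 3 bytes: pack the first two bytes (most significant first)
	   and the last byte of each buffer into a 24-bit value; for a
	   length of 2 the last byte simply repeats byte 1.  The packed
	   values fit in 24 bits, so the 32-bit subtraction cannot
	   overflow and its sign is the sign of the first difference.  */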
	.p2align 4,, 8
L(2_to_3_bytes):
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	shll	$8, %eax
	shll	$8, %ecx
	bswap	%eax
	bswap	%ecx
	movzbl	-1(%rdi, %rdx), %edi
	movzbl	-1(%rsi, %rdx), %esi
	orl	%edi, %eax
	orl	%esi, %ecx
	subl	%ecx, %eax
	ret

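	/* 8 to 15 bytes: compare the first and the last 8 bytes of each
	   buffer as byte-swapped (big-endian) 64-bit values; the two
	   8-byte loads overlap for these lengths.  */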
	.p2align 4,, 8
L(8_to_16_bytes):
	movq	(%rdi), %rax
	movq	(%rsi), %rcx

	bswap	%rax
	bswap	%rcx

	subq	%rcx, %rax
	jne	L(8_to_16_bytes_done)

	movq	-8(%rdi, %rdx), %rax
	movq	-8(%rsi, %rdx), %rcx

	bswap	%rax
	bswap	%rcx

	subq	%rcx, %rax

L(8_to_16_bytes_done):
	cmovne	%edx, %eax
	sbbl	%ecx, %ecx
	orl	%ecx, %eax
	ret
# else
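	/* wmemcmp with 2 or 3 wide characters (8 or 12 bytes): compare
	   the first two elements and the last one; for two elements the
	   last check repeats element 1.  */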
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(8_to_16_bytes_done)
	movl	4(%rdi), %ecx
	cmpl	4(%rsi), %ecx
	jne	L(8_to_16_bytes_done)
	movl	-4(%rdi, %rdx), %ecx
	cmpl	-4(%rsi, %rdx), %ecx
	jne	L(8_to_16_bytes_done)
	ret
# endif

	.p2align 4,, 3
L(ret_zero):
	xorl	%eax, %eax
L(zero):
	ret

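	/* Reached when the length is at most CHAR_SIZE.  The flags from
	   the "cmp $CHAR_SIZE, %RDX_LP" at the top are still live here:
	   jb returns zero for a length below CHAR_SIZE, otherwise exactly
	   one element is compared.  */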
	.p2align 4,, 8
L(firstbyte):
	jb	L(ret_zero)
# ifdef USE_AS_WMEMCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	je	L(zero)
L(8_to_16_bytes_done):
	setg	%al
	leal	-1(%rax, %rax), %eax
# else
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	sub	%ecx, %eax
# endif
	ret

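	/* Mismatch tails.  Every caller arrives with %eax holding
	   pmovmskb (CMPEQ result) + 1: the mask is 0xffff when all 16
	   bytes match, so the "incw" wraps it to zero, and on a mismatch
	   the lowest set bit of %eax is the byte offset of the first
	   difference.  bsf extracts that offset; the _16/_32/_48 variants
	   compensate for which 16-byte chunk failed, and the _end
	   variants index backwards from the end using the length in
	   %rdx.  */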
	.p2align 4
L(vec_return_begin_48):
	addq	$16, %rdi
	addq	$16, %rsi
L(vec_return_begin_32):
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	32(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	32(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	32(%rsi, %rax), %ecx
	movzbl	32(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(vec_return_begin_16):
	addq	$16, %rdi
	addq	$16, %rsi
L(vec_return_begin):
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(vec_return_end_16):
	subl	$16, %edx
L(vec_return_end):
	bsfl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-16(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	-16(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-16(%rsi, %rax), %ecx
	movzbl	-16(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

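	/* 33 to 79 bytes: compare 16-byte chunks from the start and from
	   the end; the chunks may overlap, but together they cover every
	   byte with at most five compares.  */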
	.p2align 4,, 8
L(more_32_bytes):
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqu	16(%rdi), %xmm0
	movdqu	16(%rsi), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_16)

	cmpl	$64, %edx
	jbe	L(32_to_64_bytes)
	movdqu	32(%rdi), %xmm0
	movdqu	32(%rsi), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_32)

	.p2align 4,, 6
L(32_to_64_bytes):
	movdqu	-32(%rdi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end_16)

	movdqu	-16(%rdi, %rdx), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end)
	ret

	.p2align 4
L(16_to_32_bytes):
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqu	-16(%rdi, %rdx), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end)
	ret


	.p2align 4
L(79bytesormore):
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)


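	/* The first 16 bytes matched.  Round %rsi up to the next 16-byte
	   boundary and adjust %rdi and %rdx by the same amount, so that
	   all following loads from %rsi are aligned; the bytes skipped
	   over have already been compared.  */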
	mov	%rsi, %rcx
	and	$-16, %rsi
	add	$16, %rsi
	sub	%rsi, %rcx

	sub	%rcx, %rdi
	add	%rcx, %rdx
	test	$0xf, %rdi
	jz	L(2aligned)

	cmp	$128, %rdx
	ja	L(128bytesormore)

	.p2align 4,, 6
L(less128bytes):
	movdqu	(%rdi), %xmm1
	CMPEQ	(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqu	16(%rdi), %xmm1
	CMPEQ	16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_16)

	movdqu	32(%rdi), %xmm1
	CMPEQ	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_32)

	movdqu	48(%rdi), %xmm1
	CMPEQ	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_48)

	cmp	$96, %rdx
	jb	L(32_to_64_bytes)

	addq	$64, %rdi
	addq	$64, %rsi
	subq	$64, %rdx

	.p2align 4,, 6
L(last_64_bytes):
	movdqu	(%rdi), %xmm1
	CMPEQ	(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqu	16(%rdi), %xmm1
	CMPEQ	16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_16)

	movdqu	-32(%rdi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end_16)

	movdqu	-16(%rdi, %rdx), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end)
	ret

	.p2align 4
L(128bytesormore):
	cmp	$256, %rdx
	ja	L(unaligned_loop)
L(less256bytes):
	movdqu	(%rdi), %xmm1
	CMPEQ	(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqu	16(%rdi), %xmm1
	CMPEQ	16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_16)

	movdqu	32(%rdi), %xmm1
	CMPEQ	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_32)

	movdqu	48(%rdi), %xmm1
	CMPEQ	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_48)

	addq	$64, %rdi
	addq	$64, %rsi

	movdqu	(%rdi), %xmm1
	CMPEQ	(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqu	16(%rdi), %xmm1
	CMPEQ	16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_16)

	movdqu	32(%rdi), %xmm1
	CMPEQ	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_32)

	movdqu	48(%rdi), %xmm1
	CMPEQ	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_48)

	addq	$-128, %rdx
	subq	$-64, %rsi
	subq	$-64, %rdi

	cmp	$64, %rdx
	ja	L(less128bytes)

	cmp	$32, %rdx
	ja	L(last_64_bytes)

	movdqu	-32(%rdi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end_16)

	movdqu	-16(%rdi, %rdx), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end)
	ret

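	/* Choose between the plain loop and the loop with non-temporal
	   prefetches: %r8 is set to 3 * __x86_data_cache_size_half (or
	   3 * DATA_CACHE_SIZE_HALF when that is defined), i.e. 1.5 times
	   the data cache size; longer inputs take the prefetching
	   loop.  */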
	.p2align 4
L(unaligned_loop):
# ifdef DATA_CACHE_SIZE_HALF
	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
# else
	mov	__x86_data_cache_size_half(%rip), %R8_LP
# endif
	movq	%r8, %r9
	addq	%r8, %r8
	addq	%r9, %r8
	cmpq	%r8, %rdx
	ja	L(L2_L3_cache_unaligned)
	sub	$64, %rdx
	.p2align 4
L(64bytesormore_loop):
	movdqu	(%rdi), %xmm0
	movdqu	16(%rdi), %xmm1
	movdqu	32(%rdi), %xmm2
	movdqu	48(%rdi), %xmm3

	CMPEQ	(%rsi), %xmm0
	CMPEQ	16(%rsi), %xmm1
	CMPEQ	32(%rsi), %xmm2
	CMPEQ	48(%rsi), %xmm3

	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw	%ax
	jnz	L(64bytesormore_loop_end)

	add	$64, %rsi
	add	$64, %rdi
	sub	$64, %rdx
	ja	L(64bytesormore_loop)

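	/* At most 64 bytes remain.  %rdx holds (bytes remaining - 64),
	   which is zero or negative here, so adding it to both pointers
	   moves them to 64 bytes before the end and the final 64 bytes
	   are compared with possibly overlapping loads.  */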
	.p2align 4,, 6
L(loop_tail):
	addq	%rdx, %rdi
	movdqu	(%rdi), %xmm0
	movdqu	16(%rdi), %xmm1
	movdqu	32(%rdi), %xmm2
	movdqu	48(%rdi), %xmm3

	addq	%rdx, %rsi
	movdqu	(%rsi), %xmm4
	movdqu	16(%rsi), %xmm5
	movdqu	32(%rsi), %xmm6
	movdqu	48(%rsi), %xmm7

	CMPEQ	%xmm4, %xmm0
	CMPEQ	%xmm5, %xmm1
	CMPEQ	%xmm6, %xmm2
	CMPEQ	%xmm7, %xmm3

	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw	%ax
	jnz	L(64bytesormore_loop_end)
	ret

L(L2_L3_cache_unaligned):
	subq	$64, %rdx
	.p2align 4
L(L2_L3_unaligned_128bytes_loop):
	prefetchnta 0x1c0(%rdi)
	prefetchnta 0x1c0(%rsi)

	movdqu	(%rdi), %xmm0
	movdqu	16(%rdi), %xmm1
	movdqu	32(%rdi), %xmm2
	movdqu	48(%rdi), %xmm3

	CMPEQ	(%rsi), %xmm0
	CMPEQ	16(%rsi), %xmm1
	CMPEQ	32(%rsi), %xmm2
	CMPEQ	48(%rsi), %xmm3

	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw	%ax
	jnz	L(64bytesormore_loop_end)

	add	$64, %rsi
	add	$64, %rdi
	sub	$64, %rdx
	ja	L(L2_L3_unaligned_128bytes_loop)
	jmp	L(loop_tail)


	/* This path is for machines that are sensitive to unaligned
	   loads; both pointers are 16-byte aligned here, so aligned
	   (movdqa) loads can be used.  */
	.p2align 4
L(2aligned):
	cmp	$128, %rdx
	ja	L(128bytesormorein2aligned)
L(less128bytesin2aligned):
	movdqa	(%rdi), %xmm1
	CMPEQ	(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqa	16(%rdi), %xmm1
	CMPEQ	16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_16)

	movdqa	32(%rdi), %xmm1
	CMPEQ	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_32)

	movdqa	48(%rdi), %xmm1
	CMPEQ	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_48)

	cmp	$96, %rdx
	jb	L(32_to_64_bytes)

	addq	$64, %rdi
	addq	$64, %rsi
	subq	$64, %rdx

	.p2align 4,, 6
L(aligned_last_64_bytes):
	movdqa	(%rdi), %xmm1
	CMPEQ	(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqa	16(%rdi), %xmm1
	CMPEQ	16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_16)

	movdqu	-32(%rdi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end_16)

	movdqu	-16(%rdi, %rdx), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end)
	ret

	.p2align 4
L(128bytesormorein2aligned):
	cmp	$256, %rdx
	ja	L(aligned_loop)
L(less256bytesin2aligned):
	movdqa	(%rdi), %xmm1
	CMPEQ	(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqa	16(%rdi), %xmm1
	CMPEQ	16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_16)

	movdqa	32(%rdi), %xmm1
	CMPEQ	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_32)

	movdqa	48(%rdi), %xmm1
	CMPEQ	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_48)

	addq	$64, %rdi
	addq	$64, %rsi

	movdqa	(%rdi), %xmm1
	CMPEQ	(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin)

	movdqa	16(%rdi), %xmm1
	CMPEQ	16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_16)

	movdqa	32(%rdi), %xmm1
	CMPEQ	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_32)

	movdqa	48(%rdi), %xmm1
	CMPEQ	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_begin_48)

	addq	$-128, %rdx
	subq	$-64, %rsi
	subq	$-64, %rdi

	cmp	$64, %rdx
	ja	L(less128bytesin2aligned)

	cmp	$32, %rdx
	ja	L(aligned_last_64_bytes)

	movdqu	-32(%rdi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end_16)

	movdqu	-16(%rdi, %rdx), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	CMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw	%ax
	jnz	L(vec_return_end)
	ret

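	/* Both pointers are 16-byte aligned and more than 256 bytes
	   remain.  This mirrors L(unaligned_loop) but uses aligned
	   (movdqa) loads, including the same switch to the
	   non-temporal-prefetch loop for very long inputs.  */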
	.p2align 4
L(aligned_loop):
# ifdef DATA_CACHE_SIZE_HALF
	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
# else
	mov	__x86_data_cache_size_half(%rip), %R8_LP
# endif
	movq	%r8, %r9
	addq	%r8, %r8
	addq	%r9, %r8
	cmpq	%r8, %rdx
	ja	L(L2_L3_cache_aligned)

	sub	$64, %rdx
	.p2align 4
L(64bytesormore_loopin2aligned):
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm1
	movdqa	32(%rdi), %xmm2
	movdqa	48(%rdi), %xmm3

	CMPEQ	(%rsi), %xmm0
	CMPEQ	16(%rsi), %xmm1
	CMPEQ	32(%rsi), %xmm2
	CMPEQ	48(%rsi), %xmm3

	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw	%ax
	jnz	L(64bytesormore_loop_end)
	add	$64, %rsi
	add	$64, %rdi
	sub	$64, %rdx
	ja	L(64bytesormore_loopin2aligned)
	jmp	L(loop_tail)

L(L2_L3_cache_aligned):
	subq	$64, %rdx
	.p2align 4
L(L2_L3_aligned_128bytes_loop):
	prefetchnta 0x1c0(%rdi)
	prefetchnta 0x1c0(%rsi)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm1
	movdqa	32(%rdi), %xmm2
	movdqa	48(%rdi), %xmm3

	CMPEQ	(%rsi), %xmm0
	CMPEQ	16(%rsi), %xmm1
	CMPEQ	32(%rsi), %xmm2
	CMPEQ	48(%rsi), %xmm3

	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw	%ax
	jnz	L(64bytesormore_loop_end)

	addq	$64, %rsi
	addq	$64, %rdi
	subq	$64, %rdx
	ja	L(L2_L3_aligned_128bytes_loop)
	jmp	L(loop_tail)

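	/* A 64-byte block mismatched.  %xmm0-%xmm3 still hold the CMPEQ
	   results of its four 16-byte chunks and %rax holds the combined
	   mask + 1.  Find the first chunk with a difference and build a
	   mask in %rcx whose lowest set bit is the byte offset of the
	   first difference within the block (bits 0-15, 16-31 or 32-47;
	   the last chunk reuses %rax after advancing the pointers by 48),
	   then bsf that mask to index the differing element.  */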
	.p2align 4
L(64bytesormore_loop_end):
	pmovmskb %xmm0, %ecx
	incw	%cx
	jnz	L(loop_end_ret)

	pmovmskb %xmm1, %ecx
	notw	%cx
	sall	$16, %ecx
	jnz	L(loop_end_ret)

	pmovmskb %xmm2, %ecx
	notw	%cx
	shlq	$32, %rcx
	jnz	L(loop_end_ret)

	addq	$48, %rdi
	addq	$48, %rsi
	movq	%rax, %rcx

	.p2align 4,, 6
L(loop_end_ret):
	bsfq	%rcx, %rcx
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rcx), %eax
	xorl	%edx, %edx
	cmpl	(%rsi, %rcx), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	ret
END (MEMCMP)
#endif