1/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
2   Copyright (C) 2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
23# ifndef STRCMP
24#  define STRCMP	__strcmp_evex
25# endif
26
27# define PAGE_SIZE	4096
28
29/* VEC_SIZE = Number of bytes in a ymm register */
30# define VEC_SIZE	32
31
32/* Shift for dividing by (VEC_SIZE * 4).  */
33# define DIVIDE_BY_VEC_4_SHIFT	7
34# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
35#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
36# endif
37
38# define VMOVU		vmovdqu64
39# define VMOVA		vmovdqa64
40
41# ifdef USE_AS_WCSCMP
42/* Compare packed dwords.  */
43#  define VPCMP		vpcmpd
44#  define VPMINU	vpminud
45#  define VPTESTM	vptestmd
46#  define SHIFT_REG32	r8d
47#  define SHIFT_REG64	r8
48/* 1 dword char == 4 bytes.  */
49#  define SIZE_OF_CHAR	4
50# else
51/* Compare packed bytes.  */
52#  define VPCMP		vpcmpb
53#  define VPMINU	vpminub
54#  define VPTESTM	vptestmb
55#  define SHIFT_REG32	ecx
56#  define SHIFT_REG64	rcx
57/* 1 byte char == 1 byte.  */
58#  define SIZE_OF_CHAR	1
59# endif
60
61# define XMMZERO	xmm16
62# define XMM0		xmm17
63# define XMM1		xmm18
64
65# define YMMZERO	ymm16
66# define YMM0		ymm17
67# define YMM1		ymm18
68# define YMM2		ymm19
69# define YMM3		ymm20
70# define YMM4		ymm21
71# define YMM5		ymm22
72# define YMM6		ymm23
73# define YMM7		ymm24
74# define YMM8		ymm25
75# define YMM9		ymm26
76# define YMM10		ymm27
77
78/* Warning!
79           wcscmp/wcsncmp have to use SIGNED comparison for elements.
80           strcmp/strncmp have to use UNSIGNED comparison for elements.
81*/
82
/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   latter can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP.  In order to check the null CHAR, the algorithm keeps
   a mask of the non-null CHARs (VPTESTM) and uses it as the write mask
   of the comparison (VPCMP), so a cleared mask bit means either a
   mismatch or a null CHAR.  In general, the cost of comparing VEC_SIZE
   bytes (32 bytes) is one VMOVU, one VPTESTM and one masked VPCMP,
   together with a kmovd to extract the mask.  The main loop (away from
   a page boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP)
   logic is the same as strcmp, except that a maximum offset is
   tracked.  If the maximum offset is reached before a difference is
   found, zero is returned.  */
98
99	.section .text.evex,"ax",@progbits
100ENTRY (STRCMP)
101# ifdef USE_AS_STRNCMP
102	/* Check for simple cases (0 or 1) in offset.  */
103	cmp	$1, %RDX_LP
104	je	L(char0)
105	jb	L(zero)
106#  ifdef USE_AS_WCSCMP
107	/* Convert units: from wide to byte char.  */
108	shl	$2, %RDX_LP
109#  endif
110	/* Register %r11 tracks the maximum offset.  */
111	mov	%RDX_LP, %R11_LP
112# endif
113	movl	%edi, %eax
114	xorl	%edx, %edx
115	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
116	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
117	orl	%esi, %eax
118	andl	$(PAGE_SIZE - 1), %eax
119	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
120	jg	L(cross_page)
121	/* Start comparing 4 vectors.  */
122	VMOVU	(%rdi), %YMM0
123
124	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
125	VPTESTM	%YMM0, %YMM0, %k2
126
127	/* Each bit cleared in K1 represents a mismatch or a null CHAR
128	   in YMM0 and 32 bytes at (%rsi).  */
129	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
130
131	kmovd	%k1, %ecx
132# ifdef USE_AS_WCSCMP
133	subl	$0xff, %ecx
134# else
135	incl	%ecx
136# endif
137	je	L(next_3_vectors)
138	tzcntl	%ecx, %edx
139# ifdef USE_AS_WCSCMP
140	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
141	sall	$2, %edx
142# endif
143# ifdef USE_AS_STRNCMP
144	/* Return 0 if the mismatched index (%rdx) is after the maximum
145	   offset (%r11).   */
146	cmpq	%r11, %rdx
147	jae	L(zero)
148# endif
149# ifdef USE_AS_WCSCMP
150	xorl	%eax, %eax
151	movl	(%rdi, %rdx), %ecx
152	cmpl	(%rsi, %rdx), %ecx
153	je	L(return)
154L(wcscmp_return):
155	setl	%al
156	negl	%eax
157	orl	$1, %eax
158L(return):
159# else
160	movzbl	(%rdi, %rdx), %eax
161	movzbl	(%rsi, %rdx), %edx
162	subl	%edx, %eax
163# endif
164	ret
165
166L(return_vec_size):
167	tzcntl	%ecx, %edx
168# ifdef USE_AS_WCSCMP
169	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
170	sall	$2, %edx
171# endif
172# ifdef USE_AS_STRNCMP
173	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
174	   the maximum offset (%r11).  */
175	addq	$VEC_SIZE, %rdx
176	cmpq	%r11, %rdx
177	jae	L(zero)
178#  ifdef USE_AS_WCSCMP
179	xorl	%eax, %eax
180	movl	(%rdi, %rdx), %ecx
181	cmpl	(%rsi, %rdx), %ecx
182	jne	L(wcscmp_return)
183#  else
184	movzbl	(%rdi, %rdx), %eax
185	movzbl	(%rsi, %rdx), %edx
186	subl	%edx, %eax
187#  endif
188# else
189#  ifdef USE_AS_WCSCMP
190	xorl	%eax, %eax
191	movl	VEC_SIZE(%rdi, %rdx), %ecx
192	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
193	jne	L(wcscmp_return)
194#  else
195	movzbl	VEC_SIZE(%rdi, %rdx), %eax
196	movzbl	VEC_SIZE(%rsi, %rdx), %edx
197	subl	%edx, %eax
198#  endif
199# endif
200	ret
201
202L(return_2_vec_size):
203	tzcntl	%ecx, %edx
204# ifdef USE_AS_WCSCMP
205	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
206	sall	$2, %edx
207# endif
208# ifdef USE_AS_STRNCMP
209	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
210	   after the maximum offset (%r11).  */
211	addq	$(VEC_SIZE * 2), %rdx
212	cmpq	%r11, %rdx
213	jae	L(zero)
214#  ifdef USE_AS_WCSCMP
215	xorl	%eax, %eax
216	movl	(%rdi, %rdx), %ecx
217	cmpl	(%rsi, %rdx), %ecx
218	jne	L(wcscmp_return)
219#  else
220	movzbl	(%rdi, %rdx), %eax
221	movzbl	(%rsi, %rdx), %edx
222	subl	%edx, %eax
223#  endif
224# else
225#  ifdef USE_AS_WCSCMP
226	xorl	%eax, %eax
227	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
228	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
229	jne	L(wcscmp_return)
230#  else
231	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
232	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
233	subl	%edx, %eax
234#  endif
235# endif
236	ret
237
238L(return_3_vec_size):
239	tzcntl	%ecx, %edx
240# ifdef USE_AS_WCSCMP
241	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
242	sall	$2, %edx
243# endif
244# ifdef USE_AS_STRNCMP
245	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
246	   after the maximum offset (%r11).  */
247	addq	$(VEC_SIZE * 3), %rdx
248	cmpq	%r11, %rdx
249	jae	L(zero)
250#  ifdef USE_AS_WCSCMP
251	xorl	%eax, %eax
252	movl	(%rdi, %rdx), %ecx
253	cmpl	(%rsi, %rdx), %ecx
254	jne	L(wcscmp_return)
255#  else
256	movzbl	(%rdi, %rdx), %eax
257	movzbl	(%rsi, %rdx), %edx
258	subl	%edx, %eax
259#  endif
260# else
261#  ifdef USE_AS_WCSCMP
262	xorl	%eax, %eax
263	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
264	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
265	jne	L(wcscmp_return)
266#  else
267	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
268	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
269	subl	%edx, %eax
270#  endif
271# endif
272	ret
273
274	.p2align 4
275L(next_3_vectors):
276	VMOVU	VEC_SIZE(%rdi), %YMM0
277	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
278	VPTESTM	%YMM0, %YMM0, %k2
279	/* Each bit cleared in K1 represents a mismatch or a null CHAR
280	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
281	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
282	kmovd	%k1, %ecx
283# ifdef USE_AS_WCSCMP
284	subl	$0xff, %ecx
285# else
286	incl	%ecx
287# endif
288	jne	L(return_vec_size)
289
290	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
291	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
292	VPTESTM	%YMM0, %YMM0, %k2
293	/* Each bit cleared in K1 represents a mismatch or a null CHAR
294	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
295	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
296	kmovd	%k1, %ecx
297# ifdef USE_AS_WCSCMP
298	subl	$0xff, %ecx
299# else
300	incl	%ecx
301# endif
302	jne	L(return_2_vec_size)
303
304	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
305	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
306	VPTESTM	%YMM0, %YMM0, %k2
307	/* Each bit cleared in K1 represents a mismatch or a null CHAR
308	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
309	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
310	kmovd	%k1, %ecx
311# ifdef USE_AS_WCSCMP
312	subl	$0xff, %ecx
313# else
314	incl	%ecx
315# endif
316	jne	L(return_3_vec_size)
317L(main_loop_header):
318	leaq	(VEC_SIZE * 4)(%rdi), %rdx
319	movl	$PAGE_SIZE, %ecx
320	/* Align load via RAX.  */
321	andq	$-(VEC_SIZE * 4), %rdx
322	subq	%rdi, %rdx
323	leaq	(%rdi, %rdx), %rax
324# ifdef USE_AS_STRNCMP
325	/* Starting from this point, the maximum offset, or simply the
326	   'offset', DECREASES by the same amount when base pointers are
327	   moved forward.  Return 0 when:
328	     1) On match: offset <= the matched vector index.
329	     2) On mistmach, offset is before the mistmatched index.
330	 */
331	subq	%rdx, %r11
332	jbe	L(zero)
333# endif
334	addq	%rsi, %rdx
335	movq	%rdx, %rsi
336	andl	$(PAGE_SIZE - 1), %esi
337	/* Number of bytes before page crossing.  */
338	subq	%rsi, %rcx
339	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
340	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
341	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
342	movl	%ecx, %esi
343	jmp	L(loop_start)
344
345	.p2align 4
346L(loop):
347# ifdef USE_AS_STRNCMP
348	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
349	   the maximum offset (%r11) by the same amount.  */
350	subq	$(VEC_SIZE * 4), %r11
351	jbe	L(zero)
352# endif
353	addq	$(VEC_SIZE * 4), %rax
354	addq	$(VEC_SIZE * 4), %rdx
355L(loop_start):
356	testl	%esi, %esi
357	leal	-1(%esi), %esi
358	je	L(loop_cross_page)
359L(back_to_loop):
360	/* Main loop, comparing 4 vectors are a time.  */
361	VMOVA	(%rax), %YMM0
362	VMOVA	VEC_SIZE(%rax), %YMM2
363	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
364	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
365
366	VPMINU	%YMM0, %YMM2, %YMM8
367	VPMINU	%YMM4, %YMM6, %YMM9
368
369	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
370	VPMINU	%YMM8, %YMM9, %YMM8
371
372	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
373	VPTESTM	%YMM8, %YMM8, %k1
374
375	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
376	vpxorq	(%rdx), %YMM0, %YMM1
377	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
378	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
379	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
380
381	vporq	%YMM1, %YMM3, %YMM9
382	vporq	%YMM5, %YMM7, %YMM10
383
384	/* A non-zero CHAR in YMM9 represents a mismatch.  */
385	vporq	%YMM9, %YMM10, %YMM9
386
387	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
388	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
389	kmovd   %k0, %ecx
390# ifdef USE_AS_WCSCMP
391	subl	$0xff, %ecx
392# else
393	incl	%ecx
394# endif
395	je	 L(loop)
396
397	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
398	VPTESTM	%YMM0, %YMM0, %k1
399	/* Each bit cleared in K0 represents a mismatch or a null CHAR
400	   in YMM0 and (%rdx).  */
401	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
402	kmovd	%k0, %ecx
403# ifdef USE_AS_WCSCMP
404	subl	$0xff, %ecx
405# else
406	incl	%ecx
407# endif
408	je	L(test_vec)
409	tzcntl	%ecx, %ecx
410# ifdef USE_AS_WCSCMP
411	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
412	sall	$2, %ecx
413# endif
414# ifdef USE_AS_STRNCMP
415	cmpq	%rcx, %r11
416	jbe	L(zero)
417#  ifdef USE_AS_WCSCMP
418	movq	%rax, %rsi
419	xorl	%eax, %eax
420	movl	(%rsi, %rcx), %edi
421	cmpl	(%rdx, %rcx), %edi
422	jne	L(wcscmp_return)
423#  else
424	movzbl	(%rax, %rcx), %eax
425	movzbl	(%rdx, %rcx), %edx
426	subl	%edx, %eax
427#  endif
428# else
429#  ifdef USE_AS_WCSCMP
430	movq	%rax, %rsi
431	xorl	%eax, %eax
432	movl	(%rsi, %rcx), %edi
433	cmpl	(%rdx, %rcx), %edi
434	jne	L(wcscmp_return)
435#  else
436	movzbl	(%rax, %rcx), %eax
437	movzbl	(%rdx, %rcx), %edx
438	subl	%edx, %eax
439#  endif
440# endif
441	ret
442
443	.p2align 4
444L(test_vec):
445# ifdef USE_AS_STRNCMP
446	/* The first vector matched.  Return 0 if the maximum offset
447	   (%r11) <= VEC_SIZE.  */
448	cmpq	$VEC_SIZE, %r11
449	jbe	L(zero)
450# endif
451	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
452	VPTESTM	%YMM2, %YMM2, %k1
453	/* Each bit cleared in K0 represents a mismatch or a null CHAR
454	   in YMM2 and VEC_SIZE(%rdx).  */
455	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
456	kmovd	%k0, %ecx
457# ifdef USE_AS_WCSCMP
458	subl	$0xff, %ecx
459# else
460	incl	%ecx
461# endif
462	je	L(test_2_vec)
463	tzcntl	%ecx, %edi
464# ifdef USE_AS_WCSCMP
465	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
466	sall	$2, %edi
467# endif
468# ifdef USE_AS_STRNCMP
469	addq	$VEC_SIZE, %rdi
470	cmpq	%rdi, %r11
471	jbe	L(zero)
472#  ifdef USE_AS_WCSCMP
473	movq	%rax, %rsi
474	xorl	%eax, %eax
475	movl	(%rsi, %rdi), %ecx
476	cmpl	(%rdx, %rdi), %ecx
477	jne	L(wcscmp_return)
478#  else
479	movzbl	(%rax, %rdi), %eax
480	movzbl	(%rdx, %rdi), %edx
481	subl	%edx, %eax
482#  endif
483# else
484#  ifdef USE_AS_WCSCMP
485	movq	%rax, %rsi
486	xorl	%eax, %eax
487	movl	VEC_SIZE(%rsi, %rdi), %ecx
488	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
489	jne	L(wcscmp_return)
490#  else
491	movzbl	VEC_SIZE(%rax, %rdi), %eax
492	movzbl	VEC_SIZE(%rdx, %rdi), %edx
493	subl	%edx, %eax
494#  endif
495# endif
496	ret
497
498	.p2align 4
499L(test_2_vec):
500# ifdef USE_AS_STRNCMP
501	/* The first 2 vectors matched.  Return 0 if the maximum offset
502	   (%r11) <= 2 * VEC_SIZE.  */
503	cmpq	$(VEC_SIZE * 2), %r11
504	jbe	L(zero)
505# endif
506	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
507	VPTESTM	%YMM4, %YMM4, %k1
508	/* Each bit cleared in K0 represents a mismatch or a null CHAR
509	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
510	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
511	kmovd	%k0, %ecx
512# ifdef USE_AS_WCSCMP
513	subl	$0xff, %ecx
514# else
515	incl	%ecx
516# endif
517	je	L(test_3_vec)
518	tzcntl	%ecx, %edi
519# ifdef USE_AS_WCSCMP
520	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
521	sall	$2, %edi
522# endif
523# ifdef USE_AS_STRNCMP
524	addq	$(VEC_SIZE * 2), %rdi
525	cmpq	%rdi, %r11
526	jbe	L(zero)
527#  ifdef USE_AS_WCSCMP
528	movq	%rax, %rsi
529	xorl	%eax, %eax
530	movl	(%rsi, %rdi), %ecx
531	cmpl	(%rdx, %rdi), %ecx
532	jne	L(wcscmp_return)
533#  else
534	movzbl	(%rax, %rdi), %eax
535	movzbl	(%rdx, %rdi), %edx
536	subl	%edx, %eax
537#  endif
538# else
539#  ifdef USE_AS_WCSCMP
540	movq	%rax, %rsi
541	xorl	%eax, %eax
542	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
543	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
544	jne	L(wcscmp_return)
545#  else
546	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
547	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
548	subl	%edx, %eax
549#  endif
550# endif
551	ret
552
553	.p2align 4
554L(test_3_vec):
555# ifdef USE_AS_STRNCMP
556	/* The first 3 vectors matched.  Return 0 if the maximum offset
557	   (%r11) <= 3 * VEC_SIZE.  */
558	cmpq	$(VEC_SIZE * 3), %r11
559	jbe	L(zero)
560# endif
561	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
562	VPTESTM	%YMM6, %YMM6, %k1
563	/* Each bit cleared in K0 represents a mismatch or a null CHAR
564	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
565	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
566	kmovd	%k0, %ecx
567# ifdef USE_AS_WCSCMP
568	subl	$0xff, %ecx
569# else
570	incl	%ecx
571# endif
572	tzcntl	%ecx, %ecx
573# ifdef USE_AS_WCSCMP
574	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
575	sall	$2, %ecx
576# endif
577# ifdef USE_AS_STRNCMP
578	addq	$(VEC_SIZE * 3), %rcx
579	cmpq	%rcx, %r11
580	jbe	L(zero)
581#  ifdef USE_AS_WCSCMP
582	movq	%rax, %rsi
583	xorl	%eax, %eax
584	movl	(%rsi, %rcx), %esi
585	cmpl	(%rdx, %rcx), %esi
586	jne	L(wcscmp_return)
587#  else
588	movzbl	(%rax, %rcx), %eax
589	movzbl	(%rdx, %rcx), %edx
590	subl	%edx, %eax
591#  endif
592# else
593#  ifdef USE_AS_WCSCMP
594	movq	%rax, %rsi
595	xorl	%eax, %eax
596	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
597	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
598	jne	L(wcscmp_return)
599#  else
600	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
601	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
602	subl	%edx, %eax
603#  endif
604# endif
605	ret
606
607	.p2align 4
608L(loop_cross_page):
609	xorl	%r10d, %r10d
610	movq	%rdx, %rcx
611	/* Align load via RDX.  We load the extra ECX bytes which should
612	   be ignored.  */
613	andl	$((VEC_SIZE * 4) - 1), %ecx
614	/* R10 is -RCX.  */
615	subq	%rcx, %r10
616
617	/* This works only if VEC_SIZE * 2 == 64. */
618# if (VEC_SIZE * 2) != 64
619#  error (VEC_SIZE * 2) != 64
620# endif
621
622	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
623	cmpl	$(VEC_SIZE * 2), %ecx
624	jge	L(loop_cross_page_2_vec)
625
626	VMOVU	(%rax, %r10), %YMM2
627	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
628
629	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
630	VPTESTM	%YMM2, %YMM2, %k2
631	/* Each bit cleared in K1 represents a mismatch or a null CHAR
632	   in YMM2 and 32 bytes at (%rdx, %r10).  */
633	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
634	kmovd	%k1, %r9d
635	/* Don't use subl since it is the lower 16/32 bits of RDI
636	   below.  */
637	notl	%r9d
638# ifdef USE_AS_WCSCMP
639	/* Only last 8 bits are valid.  */
640	andl	$0xff, %r9d
641# endif
642
643	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
644	VPTESTM	%YMM3, %YMM3, %k4
645	/* Each bit cleared in K3 represents a mismatch or a null CHAR
646	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
647	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
648	kmovd	%k3, %edi
649    /* Must use notl %edi here as lower bits are for CHAR
650	   comparisons potentially out of range thus can be 0 without
651	   indicating mismatch.  */
652	notl	%edi
653# ifdef USE_AS_WCSCMP
654	/* Don't use subl since it is the upper 8 bits of EDI below.  */
655	andl	$0xff, %edi
656# endif
657
658# ifdef USE_AS_WCSCMP
659	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
660	sall	$8, %edi
661	/* NB: Divide shift count by 4 since each bit in K1 represent 4
662	   bytes.  */
663	movl	%ecx, %SHIFT_REG32
664	sarl	$2, %SHIFT_REG32
665
666	/* Each bit in EDI represents a null CHAR or a mismatch.  */
667	orl	%r9d, %edi
668# else
669	salq	$32, %rdi
670
671	/* Each bit in RDI represents a null CHAR or a mismatch.  */
672	orq	%r9, %rdi
673# endif
674
675	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
676	shrxq	%SHIFT_REG64, %rdi, %rdi
677	testq	%rdi, %rdi
678	je	L(loop_cross_page_2_vec)
679	tzcntq	%rdi, %rcx
680# ifdef USE_AS_WCSCMP
681	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
682	sall	$2, %ecx
683# endif
684# ifdef USE_AS_STRNCMP
685	cmpq	%rcx, %r11
686	jbe	L(zero)
687#  ifdef USE_AS_WCSCMP
688	movq	%rax, %rsi
689	xorl	%eax, %eax
690	movl	(%rsi, %rcx), %edi
691	cmpl	(%rdx, %rcx), %edi
692	jne	L(wcscmp_return)
693#  else
694	movzbl	(%rax, %rcx), %eax
695	movzbl	(%rdx, %rcx), %edx
696	subl	%edx, %eax
697#  endif
698# else
699#  ifdef USE_AS_WCSCMP
700	movq	%rax, %rsi
701	xorl	%eax, %eax
702	movl	(%rsi, %rcx), %edi
703	cmpl	(%rdx, %rcx), %edi
704	jne	L(wcscmp_return)
705#  else
706	movzbl	(%rax, %rcx), %eax
707	movzbl	(%rdx, %rcx), %edx
708	subl	%edx, %eax
709#  endif
710# endif
711	ret
712
713	.p2align 4
714L(loop_cross_page_2_vec):
715	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
716	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
717	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
718
719	VPTESTM	%YMM0, %YMM0, %k2
720	/* Each bit cleared in K1 represents a mismatch or a null CHAR
721	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
722	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
723	kmovd	%k1, %r9d
724	/* Don't use subl since it is the lower 16/32 bits of RDI
725	   below.  */
726	notl	%r9d
727# ifdef USE_AS_WCSCMP
728	/* Only last 8 bits are valid.  */
729	andl	$0xff, %r9d
730# endif
731
732	VPTESTM	%YMM1, %YMM1, %k4
733	/* Each bit cleared in K3 represents a mismatch or a null CHAR
734	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
735	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
736	kmovd	%k3, %edi
737	/* Must use notl %edi here as lower bits are for CHAR
738	   comparisons potentially out of range thus can be 0 without
739	   indicating mismatch.  */
740	notl	%edi
741# ifdef USE_AS_WCSCMP
742	/* Don't use subl since it is the upper 8 bits of EDI below.  */
743	andl	$0xff, %edi
744# endif
745
746# ifdef USE_AS_WCSCMP
747	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
748	sall	$8, %edi
749
750	/* Each bit in EDI represents a null CHAR or a mismatch.  */
751	orl	%r9d, %edi
752# else
753	salq	$32, %rdi
754
755	/* Each bit in RDI represents a null CHAR or a mismatch.  */
756	orq	%r9, %rdi
757# endif
758
759	xorl	%r8d, %r8d
760	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
761	subl	$(VEC_SIZE * 2), %ecx
762	jle	1f
763	/* R8 has number of bytes skipped.  */
764	movl	%ecx, %r8d
765# ifdef USE_AS_WCSCMP
766	/* NB: Divide shift count by 4 since each bit in RDI represent 4
767	   bytes.  */
768	sarl	$2, %ecx
769	/* Skip ECX bytes.  */
770	shrl	%cl, %edi
771# else
772	/* Skip ECX bytes.  */
773	shrq	%cl, %rdi
774# endif
7751:
776	/* Before jumping back to the loop, set ESI to the number of
777	   VEC_SIZE * 4 blocks before page crossing.  */
778	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
779
780	testq	%rdi, %rdi
781# ifdef USE_AS_STRNCMP
782	/* At this point, if %rdi value is 0, it already tested
783	   VEC_SIZE*4+%r10 byte starting from %rax. This label
784	   checks whether strncmp maximum offset reached or not.  */
785	je	L(string_nbyte_offset_check)
786# else
787	je	L(back_to_loop)
788# endif
789	tzcntq	%rdi, %rcx
790# ifdef USE_AS_WCSCMP
791	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
792	sall	$2, %ecx
793# endif
794	addq	%r10, %rcx
795	/* Adjust for number of bytes skipped.  */
796	addq	%r8, %rcx
797# ifdef USE_AS_STRNCMP
798	addq	$(VEC_SIZE * 2), %rcx
799	subq	%rcx, %r11
800	jbe	L(zero)
801#  ifdef USE_AS_WCSCMP
802	movq	%rax, %rsi
803	xorl	%eax, %eax
804	movl	(%rsi, %rcx), %edi
805	cmpl	(%rdx, %rcx), %edi
806	jne	L(wcscmp_return)
807#  else
808	movzbl	(%rax, %rcx), %eax
809	movzbl	(%rdx, %rcx), %edx
810	subl	%edx, %eax
811#  endif
812# else
813#  ifdef USE_AS_WCSCMP
814	movq	%rax, %rsi
815	xorl	%eax, %eax
816	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
817	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
818	jne	L(wcscmp_return)
819#  else
820	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
821	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
822	subl	%edx, %eax
823#  endif
824# endif
825	ret
826
827# ifdef USE_AS_STRNCMP
828L(string_nbyte_offset_check):
829	leaq	(VEC_SIZE * 4)(%r10), %r10
830	cmpq	%r10, %r11
831	jbe	L(zero)
832	jmp	L(back_to_loop)
833# endif
834
835	.p2align 4
836L(cross_page_loop):
837	/* Check one byte/dword at a time.  */
838# ifdef USE_AS_WCSCMP
839	cmpl	%ecx, %eax
840# else
841	subl	%ecx, %eax
842# endif
843	jne	L(different)
844	addl	$SIZE_OF_CHAR, %edx
845	cmpl	$(VEC_SIZE * 4), %edx
846	je	L(main_loop_header)
847# ifdef USE_AS_STRNCMP
848	cmpq	%r11, %rdx
849	jae	L(zero)
850# endif
851# ifdef USE_AS_WCSCMP
852	movl	(%rdi, %rdx), %eax
853	movl	(%rsi, %rdx), %ecx
854# else
855	movzbl	(%rdi, %rdx), %eax
856	movzbl	(%rsi, %rdx), %ecx
857# endif
858	/* Check null CHAR.  */
859	testl	%eax, %eax
860	jne	L(cross_page_loop)
861	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
862	   comparisons.  */
863	subl	%ecx, %eax
864# ifndef USE_AS_WCSCMP
865L(different):
866# endif
867	ret
868
869# ifdef USE_AS_WCSCMP
870	.p2align 4
871L(different):
872	/* Use movl to avoid modifying EFLAGS.  */
873	movl	$0, %eax
874	setl	%al
875	negl	%eax
876	orl	$1, %eax
877	ret
878# endif
879
880# ifdef USE_AS_STRNCMP
881	.p2align 4
882L(zero):
883	xorl	%eax, %eax
884	ret
885
886	.p2align 4
887L(char0):
888#  ifdef USE_AS_WCSCMP
889	xorl	%eax, %eax
890	movl	(%rdi), %ecx
891	cmpl	(%rsi), %ecx
892	jne	L(wcscmp_return)
893#  else
894	movzbl	(%rsi), %ecx
895	movzbl	(%rdi), %eax
896	subl	%ecx, %eax
897#  endif
898	ret
899# endif
900
901	.p2align 4
902L(last_vector):
903	addq	%rdx, %rdi
904	addq	%rdx, %rsi
905# ifdef USE_AS_STRNCMP
906	subq	%rdx, %r11
907# endif
908	tzcntl	%ecx, %edx
909# ifdef USE_AS_WCSCMP
910	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
911	sall	$2, %edx
912# endif
913# ifdef USE_AS_STRNCMP
914	cmpq	%r11, %rdx
915	jae	L(zero)
916# endif
917# ifdef USE_AS_WCSCMP
918	xorl	%eax, %eax
919	movl	(%rdi, %rdx), %ecx
920	cmpl	(%rsi, %rdx), %ecx
921	jne	L(wcscmp_return)
922# else
923	movzbl	(%rdi, %rdx), %eax
924	movzbl	(%rsi, %rdx), %edx
925	subl	%edx, %eax
926# endif
927	ret
928
929	/* Comparing on page boundary region requires special treatment:
930	   It must done one vector at the time, starting with the wider
931	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
932	   (xmm) still passes the boundary, byte comparison must be done.
933	 */
934	.p2align 4
935L(cross_page):
936	/* Try one ymm vector at a time.  */
937	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
938	jg	L(cross_page_1_vector)
939L(loop_1_vector):
940	VMOVU	(%rdi, %rdx), %YMM0
941
942	VPTESTM	%YMM0, %YMM0, %k2
943	/* Each bit cleared in K1 represents a mismatch or a null CHAR
944	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
945	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
946	kmovd	%k1, %ecx
947# ifdef USE_AS_WCSCMP
948	subl	$0xff, %ecx
949# else
950	incl	%ecx
951# endif
952	jne	L(last_vector)
953
954	addl	$VEC_SIZE, %edx
955
956	addl	$VEC_SIZE, %eax
957# ifdef USE_AS_STRNCMP
958	/* Return 0 if the current offset (%rdx) >= the maximum offset
959	   (%r11).  */
960	cmpq	%r11, %rdx
961	jae	L(zero)
962# endif
963	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
964	jle	L(loop_1_vector)
965L(cross_page_1_vector):
966	/* Less than 32 bytes to check, try one xmm vector.  */
967	cmpl	$(PAGE_SIZE - 16), %eax
968	jg	L(cross_page_1_xmm)
969	VMOVU	(%rdi, %rdx), %XMM0
970
971	VPTESTM	%YMM0, %YMM0, %k2
972	/* Each bit cleared in K1 represents a mismatch or a null CHAR
973	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
974	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
975	kmovd	%k1, %ecx
976# ifdef USE_AS_WCSCMP
977	subl	$0xf, %ecx
978# else
979	subl	$0xffff, %ecx
980# endif
981	jne	L(last_vector)
982
983	addl	$16, %edx
984# ifndef USE_AS_WCSCMP
985	addl	$16, %eax
986# endif
987# ifdef USE_AS_STRNCMP
988	/* Return 0 if the current offset (%rdx) >= the maximum offset
989	   (%r11).  */
990	cmpq	%r11, %rdx
991	jae	L(zero)
992# endif
993
994L(cross_page_1_xmm):
995# ifndef USE_AS_WCSCMP
996	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
997	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
998	cmpl	$(PAGE_SIZE - 8), %eax
999	jg	L(cross_page_8bytes)
1000	vmovq	(%rdi, %rdx), %XMM0
1001	vmovq	(%rsi, %rdx), %XMM1
1002
1003	VPTESTM	%YMM0, %YMM0, %k2
1004	/* Each bit cleared in K1 represents a mismatch or a null CHAR
1005	   in XMM0 and XMM1.  */
1006	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
1007	kmovb	%k1, %ecx
1008# ifdef USE_AS_WCSCMP
1009	subl	$0x3, %ecx
1010# else
1011	subl	$0xff, %ecx
1012# endif
1013	jne	L(last_vector)
1014
1015	addl	$8, %edx
1016	addl	$8, %eax
1017#  ifdef USE_AS_STRNCMP
1018	/* Return 0 if the current offset (%rdx) >= the maximum offset
1019	   (%r11).  */
1020	cmpq	%r11, %rdx
1021	jae	L(zero)
1022#  endif
1023
1024L(cross_page_8bytes):
1025	/* Less than 8 bytes to check, try 4 byte vector.  */
1026	cmpl	$(PAGE_SIZE - 4), %eax
1027	jg	L(cross_page_4bytes)
1028	vmovd	(%rdi, %rdx), %XMM0
1029	vmovd	(%rsi, %rdx), %XMM1
1030
1031	VPTESTM	%YMM0, %YMM0, %k2
1032	/* Each bit cleared in K1 represents a mismatch or a null CHAR
1033	   in XMM0 and XMM1.  */
1034	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
1035	kmovd	%k1, %ecx
1036# ifdef USE_AS_WCSCMP
1037	subl	$0x1, %ecx
1038# else
1039	subl	$0xf, %ecx
1040# endif
1041	jne	L(last_vector)
1042
1043	addl	$4, %edx
1044#  ifdef USE_AS_STRNCMP
1045	/* Return 0 if the current offset (%rdx) >= the maximum offset
1046	   (%r11).  */
1047	cmpq	%r11, %rdx
1048	jae	L(zero)
1049#  endif
1050
1051L(cross_page_4bytes):
1052# endif
1053	/* Less than 4 bytes to check, try one byte/dword at a time.  */
1054# ifdef USE_AS_STRNCMP
1055	cmpq	%r11, %rdx
1056	jae	L(zero)
1057# endif
1058# ifdef USE_AS_WCSCMP
1059	movl	(%rdi, %rdx), %eax
1060	movl	(%rsi, %rdx), %ecx
1061# else
1062	movzbl	(%rdi, %rdx), %eax
1063	movzbl	(%rsi, %rdx), %ecx
1064# endif
1065	testl	%eax, %eax
1066	jne	L(cross_page_loop)
1067	subl	%ecx, %eax
1068	ret
1069END (STRCMP)
1070#endif
1071