/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC one
      vector at a time, checking for early mismatches. Only do this
      if it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.

When possible the implementation tries to optimize for the frontend in
the following ways:
Throughput:
    1. All code sections that fit are able to run optimally out of the
       LSD.
    2. All code sections that fit are able to run optimally out of the
       DSB.
    3. Basic blocks are contained in the minimum number of fetch blocks
       necessary.

Latency:
    1. Logically connected basic blocks are put in the same
       cache-line.
    2. Logically connected basic blocks that do not fit in the same
       cache-line are put in adjacent lines. This can benefit from
       L2 spatial prefetching and L1 next-line prefetching.  */

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_evex_movbe
# endif

# define VMOVU		vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK	vmovdqu32
#  define CHAR_SIZE	4
#  define VPCMP	vpcmpd
#  define VPTEST	vptestmd
# else
#  define VMOVU_MASK	vmovdqu8
#  define CHAR_SIZE	1
#  define VPCMP	vpcmpub
#  define VPTEST	vptestmb
# endif


# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
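
/* NB: for memcmp CHAR_SIZE is 1 so CHAR_PER_VEC is 32; for wmemcmp
   CHAR_SIZE is 4 (wchar_t) so CHAR_PER_VEC is 8.  */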

# define XMM0		xmm16
# define XMM1		xmm17
# define XMM2		xmm18
# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

/* Warning!
           wmemcmp has to use SIGNED comparison for elements.
           memcmp has to use UNSIGNED comparison for elements.
*/

	.section .text.evex,"ax",@progbits
/* Cache align memcmp entry. This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$CHAR_PER_VEC, %RDX_LP
	/* Fall through for [0, CHAR_PER_VEC] as it is the hottest.  */
	ja	L(more_1x_vec)

	/* Create mask of the CHARs we want to compare. This allows us
	   to avoid having to include page cross logic.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
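	/* bzhi zeroes all bits at positions >= edx, so ecx has exactly
	   the low edx bits set: one mask bit per CHAR in range.  */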
	kmovd	%ecx, %k2

	/* Safe to load full ymm with mask.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
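	/* Predicate 4 is "not equal", so k1 has a bit set for every
	   in-range CHAR that differs between s1 and s2.  */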
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	ret

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
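	/* edx is 1 if the s1 CHAR is greater (signed compare) and 0
	   otherwise, so 2 * edx - 1 below yields 1 or -1.  */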
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(more_1x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4,(%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	/* NB: eax must be destination register if going to
	   L(return_vec_[0,2]). For L(return_vec_3) destination register
	   must be ecx.  */
	testl	%eax, %eax
	jnz	L(return_vec_0)

	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
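	/* rdi and rsi now point VEC_SIZE * 4 before the end of s1/s2,
	   so the 4x VEC compares below may overlap CHARs already
	   checked above when size < 8 * CHAR_PER_VEC.  */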

	/* Wait to load from s1 until the addresses have been adjusted;
	   this avoids unlamination of micro-fusion with a complex
	   address mode.  */

	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
	   will have some 1s.  */
	vpxorq	(%rdi), %YMM1, %YMM1
	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
	   oring with YMM1. Result is stored in YMM4.  */
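	/* NB: imm8 0xde computes (A ^ C) | B where A is the destination
	   (YMM4), B is YMM1 and C is the memory operand.  */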
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
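	/* NB: imm8 0xfe computes A | B | C.  */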
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4

	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
	 */
	VPTEST	%YMM4, %YMM4, %k1
	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret


	.p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
	movq	%rdx, %rdi
L(8x_return_vec_0_1_2_3):
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
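	/* On entry YMM1-YMM3 hold s1 ^ s2 for the first three VEC of
	   the block and YMM4 or's together all four xor results.
	   Retest each VEC to find the first one with a mismatch.  */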
	VPTEST	%YMM1, %YMM1, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)

	VPTEST	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	VPTEST	%YMM3, %YMM3, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)
L(return_vec_3):
	/* bsf saves 1 byte over tzcnt. This keeps L(return_vec_3) in one
	   fetch block and the entire L(*return_vec_0_1_2_3) in one cache
	   line.  */
	bsfl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(return_vec_1):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(return_vec_2):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(more_8x_vec):
	/* Set rdx to the end of s1 minus 4 * VEC_SIZE. This is both the
	   loop bound and the base for the final 4x VEC compares.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1. This allows the loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
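	/* NB: subq of -(VEC_SIZE * 4) rather than addq of (VEC_SIZE * 4)
	   because -128 fits in an imm8 while 128 does not, saving code
	   size.  */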

	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1
	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
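	/* Remaining length is VEC_SIZE * 4 - rdi:
	   rdi >= VEC_SIZE * 3 -> at most 1x VEC left,
	   rdi >= VEC_SIZE * 2 -> at most 2x VEC left,
	   otherwise up to 4x VEC handled branchlessly below.  */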
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_end_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4,, 10
L(8x_last_2x_vec):
	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_2)
	/* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_3)
	ret

	/* Not ideally aligned (at offset +9 bytes in the fetch block),
	   but not aligning keeps it in the same cache line as
	   L(8x_last_1x/2x_vec), so it is likely worth it. It also saves
	   code size.  */
	.p2align 4,, 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(last_2x_vec):
	/* Check second to last VEC.  */
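	/* Loads below are relative to the end of the buffers, so they
	   may overlap CHARs already checked by the first VEC compares.
	 */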
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0_end)
	ret


	/* Don't align. Takes 2 fetch blocks either way and aligning
	   would cause code to spill into another cache line.  */
L(return_vec_1_end):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addl	%edx, %eax
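	/* rdx still holds the length in CHARs, so rax is now
	   length + mismatch index and the -(VEC_SIZE * 2) displacement
	   below addresses the mismatching CHAR.  */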
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	/* Don't align. Takes 2 fetch blocks either way and aligning
	   would cause code to spill into another cache line.  */
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret
	/* 1 byte until the next cache line.  */

END (MEMCMP)
#endif