/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
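/* Number of characters that fit in one vector: 32 for the byte
   variants, VEC_SIZE / 4 = 8 for the wchar_t variants.  */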
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RSI_LP, %RSI_LP
#  else
	test	%RSI_LP, %RSI_LP
#  endif
	jz	L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	/* Clear high bits from edi. Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
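	/* Here eax = addr % PAGE_SIZE is at most PAGE_SIZE - VEC_SIZE,
	   so a full VEC_SIZE load from rdi cannot touch the following
	   page.  */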

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If no match was found continue to aligned_more. Otherwise
	   return the bit position of the first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	btsq	%rsi, %rax
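	/* E.g. for the byte variant with max len 5 and no NUL in the
	   first 5 bytes, bit 5 is now the lowest set bit, so the tzcnt
	   below returns 5.  */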
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the ecx value computed earlier to get the correct return
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the ecx value computed earlier to get the correct return
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the ecx value computed earlier to get the correct return
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the ecx value computed earlier to get the correct return
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1. This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
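	/* rdi now points at the last byte of the VEC_SIZE block
	   containing the start of the string, so all following loads
	   use a +1 displacement.  */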
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length. If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
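	/* rsi now counts the characters remaining past the first 4
	   vectors (biased by the extra CHAR_SIZE above); a borrow
	   means the maxlen limit falls inside them.  */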
# endif
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
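	/* rsi is now measured from the new VEC_SIZE * 4 aligned base,
	   so the loop below can simply subtract CHAR_PER_VEC * 4 per
	   iteration.  */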
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively there is no issue
	   with overlap.  */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4

	VPMINU	%ymm2, %ymm4, %ymm5
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb %ymm5, %ecx
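	/* ymm5 is the unsigned minimum across all 4 vectors, so it
	   contains a zero character iff at least one of them does;
	   ecx is nonzero exactly when this 4 * VEC_SIZE block holds
	   the terminator.  */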

	subq	$-(VEC_SIZE * 4), %rdi
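	/* NB: subq of -(VEC_SIZE * 4) encodes the constant as an imm8
	   whereas addq of VEC_SIZE * 4 = 128 would need an imm32.  */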
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	/* rcx has the combined result from all 4 VEC. It will only be
	   used if none of the first 3 VEC contain a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
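	/* Low 32 bits of the combined mask hold the VEC 3 matches; the
	   high 32 bits (from the min of all 4 VEC) are only reached by
	   tzcnt when VEC 3 is clean and then hold the VEC 4 matches,
	   so the result is the byte offset from the start of VEC 3.  */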
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN


# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	vpmovmskb %ymm1, %eax
	/* If remaining length > VEC_SIZE * 2, branch to last_4x_vec.
	   This works even if esi is off by VEC_SIZE * 4.  */
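	/* Testing bit 6 (VEC_SIZE * 2) tolerates that error because
	   adding or subtracting VEC_SIZE * 4 = 128 only changes bits 7
	   and above.  */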
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by VEC_SIZE * 4 in either direction
	   depending on where this was called from. This normalizes
	   it.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max):
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)
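	/* Neither this check nor the VEC 2 check below needs an
	   end-of-data test: on this path the remaining length is
	   known to be greater than VEC_SIZE * 2.  */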

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes. sarxl only uses bits [4:0] of the
	   count for a 32 bit operand, so there is no need to manually
	   mod edx.  */
	sarxl	%edx, %eax, %eax
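	/* After the shift, bit 0 of eax corresponds to the first byte
	   of the string within the aligned load.  */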
# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %ecx
#  endif
	/* Check length.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide length by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
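	/* eax = min(byte position of first match, maxlen in bytes).  */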
#  ifdef USE_AS_WCSLEN
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif