/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ	vpcmpeqd
#  define VPBROADCAST	vpbroadcastd
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPBROADCAST	vpbroadcastb
#  define CHAR_SIZE	1
# endif

# ifdef USE_AS_RAWMEMCHR
#  define ERAW_PTR_REG	ecx
#  define RRAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define ERAW_PTR_REG	edi
#  define RRAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif
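/* For rawmemchr the aligned pointer can stay in RDI, since the original
   pointer is only needed briefly (kept in RCX) to trim the first match
   mask.  For memchr the original pointer must remain in RDI so the
   length can be adjusted, so the aligned copy lives in RCX instead.  */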

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
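	/* void *memchr (const void *s, int c, size_t n): s in RDI, c in
	   ESI, n in RDX.  rawmemchr takes no length argument; wmemchr
	   takes n in wchar_t units.  */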
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
#  else
	test	%RDX_LP, %RDX_LP
#  endif
	jz	L(null)
# endif
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	VPBROADCAST %xmm0, %ymm0
	/* Check if we may cross page boundary with one vector load.  */
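	/* The load below would cross a page boundary iff the offset of
	   RDI within its page is greater than PAGE_SIZE - VEC_SIZE.  */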
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
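	/* VPCMPEQ sets each matching CHAR to all-ones; vpmovmskb packs
	   the byte MSBs into a 32-bit mask, so each matching CHAR
	   contributes CHAR_SIZE consecutive mask bits.  */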
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length <= CHAR_PER_VEC handle special.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
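	/* Match in the first VEC: tzcnt gives the byte offset of the
	   lowest set mask bit, i.e. of the first matching CHAR.  */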
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
	.p2align 5
L(first_vec_x0):
	/* Check if the first match is within the length.  */
	tzcntl	%eax, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
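	/* Return NULL (from the zeroed RCX) if the first match is not
	   within the length.  */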
	xorl	%ecx, %ecx
	cmpl	%eax, %edx
	leaq	(%rdi, %rax), %rax
	cmovle	%rcx, %rax
	VZEROUPPER_RETURN

L(null):
	xorl	%eax, %eax
	ret
# endif
	.p2align 4
L(cross_page_boundary):
	/* Save pointer before aligning as its original value is
	   necessary for computing the return address if a byte is found
	   or for adjusting the length if it is not and this is memchr.  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
	   and rdi for rawmemchr.  */
	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Calculate length until end of page (length checked for a
	   match).  */
	leaq	1(%ALGN_PTR_REG), %rsi
	subq	%RRAW_PTR_REG, %rsi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %esi
#  endif
# endif
	/* Remove the leading bytes.  */
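	/* The SARX shift count is taken mod 32, so this shifts the mask
	   right by the misalignment of the original pointer within the
	   VEC, discarding match bits for bytes before the start.  */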
	sarxl	%ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	addq	%RRAW_PTR_REG, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
L(cross_page_continue):
	/* Align data to VEC_SIZE - 1.  */
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	orq	$(VEC_SIZE - 1), %rdi
	/* ESI is the number of CHARs from the original pointer through
	   the end of the next 4 VECs; it is used to adjust the length
	   and detect whether the end of the data is near.  */
	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
# else
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
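	/* RDI points at the last byte of an aligned VEC, so 1(%rdi) is
	   VEC_SIZE aligned.  */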
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length. If near end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

# ifndef USE_AS_RAWMEMCHR
	/* Check if at last CHAR_PER_VEC * 4 length.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
	   length.  */
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	addq	%rcx, %rdx
# else
	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
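	/* OR the four compare results together so a single vpmovmskb and
	   test can detect a match anywhere in the 4 VECs.  */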
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
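	/* subq $-(VEC_SIZE * 4) rather than addq $(VEC_SIZE * 4) so the
	   immediate fits in a sign-extended imm8.  */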
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)
# else
	testl	%ecx, %ecx
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into the case where fewer than 4 vectors of
	   length remain.  */
	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	.p2align 4
L(last_4x_vec_or_less):
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	/* Check if first VEC contained match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > VEC_SIZE * 2.  */
	addl	$(VEC_SIZE * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < VEC_SIZE.  */
	addl	$VEC_SIZE, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match with remaining length.  */
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl	%eax, %eax
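	/* If EAX is zero tzcnt returns VEC_SIZE, which is at least the
	   remaining length here, so the branch below also handles the
	   no-match case.  */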
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
L(zero_end):
	VZEROUPPER_RETURN

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if a match was found in
	   the loop.  */

	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_return)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	vpmovmskb %ymm3, %eax
	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
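	/* ECX still holds the combined mask of all four VECs, but the
	   VEC1/VEC2 bits are known to be zero here, so once the VEC3
	   mask (low half) is empty the high half is exactly the VEC4
	   mask.  Any VEC3 bits in the high half are harmless since
	   tzcnt finds the lowest set bit.  */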
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
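	/* For rawmemchr RDI was already advanced past the 4 VECs inside
	   the loop, so step back to the start of VEC3; for memchr it was
	   not, so step forward.  */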
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 2 - 1), %rdi
# else
	subq	$-(VEC_SIZE * 2 + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Adjust length.  */
	subl	$-(VEC_SIZE * 4), %edx
	/* Check if match within remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
	.p2align 4
L(set_zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4 - 1), %rdi
# else
	incq	%rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 3 - 1), %rdi
# else
	subq	$-(VEC_SIZE + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl	$(VEC_SIZE * 2), %edx
	jle	L(last_2x_vec)
	.p2align 4
L(last_4x_vec):
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Create mask for possible matches within remaining length.  */
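	/* BZHI clears all bits of the all-ones value at positions >= RDX,
	   leaving a mask of the RDX lowest bits.  */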
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx

	/* Test matches in data against the length mask.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= VEC_SIZE * 3 (Note this is after
	   remaining length was found to be > VEC_SIZE * 2).  */
	subl	$VEC_SIZE, %edx
	jbe	L(zero_end2)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Shift remaining length mask for last VEC.  */
	shrq	$32, %rcx
	andl	%ecx, %eax
	jz	L(zero_end2)
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
L(zero_end2):
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subq	$-(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# endif

END (MEMCHR)
#endif