1/* wcsrchr with SSE2, without using bsf instructions.
2   Copyright (C) 2011-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20# include <sysdep.h>
21# define CFI_PUSH(REG)	\
22	cfi_adjust_cfa_offset (4);	\
23	cfi_rel_offset (REG, 0)
24
25# define CFI_POP(REG)	\
26	cfi_adjust_cfa_offset (-4);	\
27	cfi_restore (REG)
28
29# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
30# define POP(REG)	popl REG; CFI_POP (REG)
31
32# define PARMS	8
33# define ENTRANCE	PUSH (%edi);
34# define RETURN	POP (%edi); ret; CFI_PUSH (%edi);
35# define STR1	PARMS
36# define STR2	STR1+4
37
38	atom_text_section
39ENTRY (__wcsrchr_sse2)
40
41	ENTRANCE
42	mov	STR1(%esp), %ecx
43	movd	STR2(%esp), %xmm1
44
45	mov	%ecx, %edi
46	punpckldq %xmm1, %xmm1
47	pxor	%xmm2, %xmm2
48	punpckldq %xmm1, %xmm1
49
50/* ECX has OFFSET. */
51	and	$63, %ecx
52	cmp	$48, %ecx
53	ja	L(crosscache)
54
55/* unaligned string. */
56	movdqu	(%edi), %xmm0
57	pcmpeqd	%xmm0, %xmm2
58	pcmpeqd	%xmm1, %xmm0
59/* Find where NULL is.  */
60	pmovmskb %xmm2, %ecx
61/* Check if there is a match.  */
62	pmovmskb %xmm0, %eax
63	add	$16, %edi
64
65	test	%eax, %eax
66	jnz	L(unaligned_match1)
67
68	test	%ecx, %ecx
69	jnz	L(return_null)
70
71	and	$-16, %edi
72
73	PUSH	(%esi)
74
75	xor	%edx, %edx
76	jmp	L(loop)
77
78	CFI_POP	(%esi)
79
80	.p2align 4
81L(unaligned_match1):
82	test	%ecx, %ecx
83	jnz	L(prolog_find_zero_1)
84
85	PUSH	(%esi)
86
87/* Save current match */
88	mov	%eax, %edx
89	mov	%edi, %esi
90	and	$-16, %edi
91	jmp	L(loop)
92
93	CFI_POP	(%esi)
94
95	.p2align 4
96L(crosscache):
97/* Hancle unaligned string.  */
98	and	$15, %ecx
99	and	$-16, %edi
100	pxor	%xmm3, %xmm3
101	movdqa	(%edi), %xmm0
102	pcmpeqd	%xmm0, %xmm3
103	pcmpeqd	%xmm1, %xmm0
104/* Find where NULL is.  */
105	pmovmskb %xmm3, %edx
106/* Check if there is a match.  */
107	pmovmskb %xmm0, %eax
108/* Remove the leading bytes.  */
109	shr	%cl, %edx
110	shr	%cl, %eax
111	add	$16, %edi
112
113	test	%eax, %eax
114	jnz	L(unaligned_match)
115
116	test	%edx, %edx
117	jnz	L(return_null)
118
119	PUSH	(%esi)
120
121	xor	%edx, %edx
122	jmp	L(loop)
123
124	CFI_POP	(%esi)
125
126	.p2align 4
127L(unaligned_match):
128	test	%edx, %edx
129	jnz	L(prolog_find_zero)
130
131	PUSH	(%esi)
132
133	mov	%eax, %edx
134	lea	(%edi, %ecx), %esi
135
136/* Loop start on aligned string.  */
137	.p2align 4
138L(loop):
139	movdqa	(%edi), %xmm0
140	pcmpeqd	%xmm0, %xmm2
141	add	$16, %edi
142	pcmpeqd	%xmm1, %xmm0
143	pmovmskb %xmm2, %ecx
144	pmovmskb %xmm0, %eax
145	or	%eax, %ecx
146	jnz	L(matches)
147
148	movdqa	(%edi), %xmm3
149	pcmpeqd	%xmm3, %xmm2
150	add	$16, %edi
151	pcmpeqd	%xmm1, %xmm3
152	pmovmskb %xmm2, %ecx
153	pmovmskb %xmm3, %eax
154	or	%eax, %ecx
155	jnz	L(matches)
156
157	movdqa	(%edi), %xmm4
158	pcmpeqd	%xmm4, %xmm2
159	add	$16, %edi
160	pcmpeqd	%xmm1, %xmm4
161	pmovmskb %xmm2, %ecx
162	pmovmskb %xmm4, %eax
163	or	%eax, %ecx
164	jnz	L(matches)
165
166	movdqa	(%edi), %xmm5
167	pcmpeqd	%xmm5, %xmm2
168	add	$16, %edi
169	pcmpeqd	%xmm1, %xmm5
170	pmovmskb %xmm2, %ecx
171	pmovmskb %xmm5, %eax
172	or	%eax, %ecx
173	jz	L(loop)
174
175	.p2align 4
176L(matches):
177	test	%eax, %eax
178	jnz	L(match)
179L(return_value):
180	test	%edx, %edx
181	jz	L(return_null_1)
182	mov	%edx, %eax
183	mov	%esi, %edi
184
185	POP	(%esi)
186
187	test	%ah, %ah
188	jnz	L(match_third_or_fourth_wchar)
189	test	$15 << 4, %al
190	jnz	L(match_second_wchar)
191	lea	-16(%edi), %eax
192	RETURN
193
194	CFI_PUSH	(%esi)
195
196	.p2align 4
197L(return_null_1):
198	POP	(%esi)
199
200	xor	%eax, %eax
201	RETURN
202
203	CFI_PUSH	(%esi)
204
205	.p2align 4
206L(match):
207	pmovmskb %xmm2, %ecx
208	test	%ecx, %ecx
209	jnz	L(find_zero)
210/* save match info */
211	mov	%eax, %edx
212	mov	%edi, %esi
213	jmp	L(loop)
214
215	.p2align 4
216L(find_zero):
217	test	%cl, %cl
218	jz	L(find_zero_in_third_or_fourth_wchar)
219	test	$15, %cl
220	jz	L(find_zero_in_second_wchar)
221	and	$1, %eax
222	jz	L(return_value)
223
224	POP	(%esi)
225
226	lea	-16(%edi), %eax
227	RETURN
228
229	CFI_PUSH	(%esi)
230
231	.p2align 4
232L(find_zero_in_second_wchar):
233	and	$1 << 5 - 1, %eax
234	jz	L(return_value)
235
236	POP	(%esi)
237
238	test	$15 << 4, %al
239	jnz	L(match_second_wchar)
240	lea	-16(%edi), %eax
241	RETURN
242
243	CFI_PUSH	(%esi)
244
245	.p2align 4
246L(find_zero_in_third_or_fourth_wchar):
247	test	$15, %ch
248	jz	L(find_zero_in_fourth_wchar)
249	and	$1 << 9 - 1, %eax
250	jz	L(return_value)
251
252	POP	(%esi)
253
254	test	%ah, %ah
255	jnz	L(match_third_wchar)
256	test	$15 << 4, %al
257	jnz	L(match_second_wchar)
258	lea	-16(%edi), %eax
259	RETURN
260
261	CFI_PUSH	(%esi)
262
263	.p2align 4
264L(find_zero_in_fourth_wchar):
265
266	POP	(%esi)
267
268	test	%ah, %ah
269	jnz	L(match_third_or_fourth_wchar)
270	test	$15 << 4, %al
271	jnz	L(match_second_wchar)
272	lea	-16(%edi), %eax
273	RETURN
274
275	CFI_PUSH	(%esi)
276
277	.p2align 4
278L(match_second_wchar):
279	lea	-12(%edi), %eax
280	RETURN
281
282	.p2align 4
283L(match_third_or_fourth_wchar):
284	test	$15 << 4, %ah
285	jnz	L(match_fourth_wchar)
286	lea	-8(%edi), %eax
287	RETURN
288
289	.p2align 4
290L(match_third_wchar):
291	lea	-8(%edi), %eax
292	RETURN
293
294	.p2align 4
295L(match_fourth_wchar):
296	lea	-4(%edi), %eax
297	RETURN
298
299	.p2align 4
300L(return_null):
301	xor	%eax, %eax
302	RETURN
303
304	.p2align 4
305L(prolog_find_zero):
306	add	%ecx, %edi
307	mov     %edx, %ecx
308L(prolog_find_zero_1):
309	test	%cl, %cl
310	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
311	test	$15, %cl
312	jz	L(prolog_find_zero_in_second_wchar)
313	and	$1, %eax
314	jz	L(return_null)
315
316	lea	-16(%edi), %eax
317	RETURN
318
319	.p2align 4
320L(prolog_find_zero_in_second_wchar):
321	and	$1 << 5 - 1, %eax
322	jz	L(return_null)
323
324	test	$15 << 4, %al
325	jnz	L(match_second_wchar)
326	lea	-16(%edi), %eax
327	RETURN
328
329	.p2align 4
330L(prolog_find_zero_in_third_or_fourth_wchar):
331	test	$15, %ch
332	jz	L(prolog_find_zero_in_fourth_wchar)
333	and	$1 << 9 - 1, %eax
334	jz	L(return_null)
335
336	test	%ah, %ah
337	jnz	L(match_third_wchar)
338	test	$15 << 4, %al
339	jnz	L(match_second_wchar)
340	lea	-16(%edi), %eax
341	RETURN
342
343	.p2align 4
344L(prolog_find_zero_in_fourth_wchar):
345	test	%ah, %ah
346	jnz	L(match_third_or_fourth_wchar)
347	test	$15 << 4, %al
348	jnz	L(match_second_wchar)
349	lea	-16(%edi), %eax
350	RETURN
351
352END (__wcsrchr_sse2)
353#endif
354