1/* strcpy with SSE2 and unaligned load
2   Copyright (C) 2011-2021 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# ifndef USE_AS_STRCAT
22#  include <sysdep.h>
23
24#  ifndef STRCPY
25#   define STRCPY  __strcpy_sse2_unaligned
26#  endif
27
28# endif
29
30# define JMPTBL(I, B)	I - B
31# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)             \
32	lea	TABLE(%rip), %r11;                              \
33	movslq	(%r11, INDEX, SCALE), %rcx;                     \
34	lea	(%r11, %rcx), %rcx;                             \
35	_CET_NOTRACK jmp *%rcx
36
37# ifndef USE_AS_STRCAT
38
39.text
40ENTRY (STRCPY)
41#  ifdef USE_AS_STRNCPY
42	mov	%RDX_LP, %R8_LP
43	test	%R8_LP, %R8_LP
44	jz	L(ExitZero)
45#  endif
46	mov	%rsi, %rcx
47#  ifndef USE_AS_STPCPY
48	mov	%rdi, %rax      /* save result */
49#  endif
50
51# endif
52
53	and	$63, %rcx
54	cmp	$32, %rcx
55	jbe	L(SourceStringAlignmentLess32)
56
57	and	$-16, %rsi
58	and	$15, %rcx
59	pxor	%xmm0, %xmm0
60	pxor	%xmm1, %xmm1
61
62	pcmpeqb	(%rsi), %xmm1
63	pmovmskb %xmm1, %rdx
64	shr	%cl, %rdx
65
66# ifdef USE_AS_STRNCPY
67#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
68	mov	$16, %r10
69	sub	%rcx, %r10
70	cmp	%r10, %r8
71#  else
72	mov	$17, %r10
73	sub	%rcx, %r10
74	cmp	%r10, %r8
75#  endif
76	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
77# endif
78	test	%rdx, %rdx
79	jnz	L(CopyFrom1To16BytesTail)
80
81	pcmpeqb	16(%rsi), %xmm0
82	pmovmskb %xmm0, %rdx
83
84# ifdef USE_AS_STRNCPY
85	add	$16, %r10
86	cmp	%r10, %r8
87	jbe	L(CopyFrom1To32BytesCase2OrCase3)
88# endif
89	test	%rdx, %rdx
90	jnz	L(CopyFrom1To32Bytes)
91
92	movdqu	(%rsi, %rcx), %xmm1   /* copy 16 bytes */
93	movdqu	%xmm1, (%rdi)
94
95/* If source address alignment != destination address alignment */
96	.p2align 4
97L(Unalign16Both):
98	sub	%rcx, %rdi
99# ifdef USE_AS_STRNCPY
100	add	%rcx, %r8
101	sbb	%rcx, %rcx
102	or	%rcx, %r8
103# endif
104	mov	$16, %rcx
105	movdqa	(%rsi, %rcx), %xmm1
106	movaps	16(%rsi, %rcx), %xmm2
107	movdqu	%xmm1, (%rdi, %rcx)
108	pcmpeqb	%xmm2, %xmm0
109	pmovmskb %xmm0, %rdx
110	add	$16, %rcx
111# ifdef USE_AS_STRNCPY
112	sub	$48, %r8
113	jbe	L(CopyFrom1To16BytesCase2OrCase3)
114# endif
115	test	%rdx, %rdx
116# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
117	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
118# else
119	jnz	L(CopyFrom1To16Bytes)
120# endif
121
122	movaps	16(%rsi, %rcx), %xmm3
123	movdqu	%xmm2, (%rdi, %rcx)
124	pcmpeqb	%xmm3, %xmm0
125	pmovmskb %xmm0, %rdx
126	add	$16, %rcx
127# ifdef USE_AS_STRNCPY
128	sub	$16, %r8
129	jbe	L(CopyFrom1To16BytesCase2OrCase3)
130# endif
131	test	%rdx, %rdx
132# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
133	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
134# else
135	jnz	L(CopyFrom1To16Bytes)
136# endif
137
138	movaps	16(%rsi, %rcx), %xmm4
139	movdqu	%xmm3, (%rdi, %rcx)
140	pcmpeqb	%xmm4, %xmm0
141	pmovmskb %xmm0, %rdx
142	add	$16, %rcx
143# ifdef USE_AS_STRNCPY
144	sub	$16, %r8
145	jbe	L(CopyFrom1To16BytesCase2OrCase3)
146# endif
147	test	%rdx, %rdx
148# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
149	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
150# else
151	jnz	L(CopyFrom1To16Bytes)
152# endif
153
154	movaps	16(%rsi, %rcx), %xmm1
155	movdqu	%xmm4, (%rdi, %rcx)
156	pcmpeqb	%xmm1, %xmm0
157	pmovmskb %xmm0, %rdx
158	add	$16, %rcx
159# ifdef USE_AS_STRNCPY
160	sub	$16, %r8
161	jbe	L(CopyFrom1To16BytesCase2OrCase3)
162# endif
163	test	%rdx, %rdx
164# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
165	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
166# else
167	jnz	L(CopyFrom1To16Bytes)
168# endif
169
170	movaps	16(%rsi, %rcx), %xmm2
171	movdqu	%xmm1, (%rdi, %rcx)
172	pcmpeqb	%xmm2, %xmm0
173	pmovmskb %xmm0, %rdx
174	add	$16, %rcx
175# ifdef USE_AS_STRNCPY
176	sub	$16, %r8
177	jbe	L(CopyFrom1To16BytesCase2OrCase3)
178# endif
179	test	%rdx, %rdx
180# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
181	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
182# else
183	jnz	L(CopyFrom1To16Bytes)
184# endif
185
186	movaps	16(%rsi, %rcx), %xmm3
187	movdqu	%xmm2, (%rdi, %rcx)
188	pcmpeqb	%xmm3, %xmm0
189	pmovmskb %xmm0, %rdx
190	add	$16, %rcx
191# ifdef USE_AS_STRNCPY
192	sub	$16, %r8
193	jbe	L(CopyFrom1To16BytesCase2OrCase3)
194# endif
195	test	%rdx, %rdx
196# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
197	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
198# else
199	jnz	L(CopyFrom1To16Bytes)
200# endif
201
202	movdqu	%xmm3, (%rdi, %rcx)
203	mov	%rsi, %rdx
204	lea	16(%rsi, %rcx), %rsi
205	and	$-0x40, %rsi
206	sub	%rsi, %rdx
207	sub	%rdx, %rdi
208# ifdef USE_AS_STRNCPY
209	lea	128(%r8, %rdx), %r8
210# endif
211L(Unaligned64Loop):
212	movaps	(%rsi), %xmm2
213	movaps	%xmm2, %xmm4
214	movaps	16(%rsi), %xmm5
215	movaps	32(%rsi), %xmm3
216	movaps	%xmm3, %xmm6
217	movaps	48(%rsi), %xmm7
218	pminub	%xmm5, %xmm2
219	pminub	%xmm7, %xmm3
220	pminub	%xmm2, %xmm3
221	pcmpeqb	%xmm0, %xmm3
222	pmovmskb %xmm3, %rdx
223# ifdef USE_AS_STRNCPY
224	sub	$64, %r8
225	jbe	L(UnalignedLeaveCase2OrCase3)
226# endif
227	test	%rdx, %rdx
228	jnz	L(Unaligned64Leave)
229
230L(Unaligned64Loop_start):
231	add	$64, %rdi
232	add	$64, %rsi
233	movdqu	%xmm4, -64(%rdi)
234	movaps	(%rsi), %xmm2
235	movdqa	%xmm2, %xmm4
236	movdqu	%xmm5, -48(%rdi)
237	movaps	16(%rsi), %xmm5
238	pminub	%xmm5, %xmm2
239	movaps	32(%rsi), %xmm3
240	movdqu	%xmm6, -32(%rdi)
241	movaps	%xmm3, %xmm6
242	movdqu	%xmm7, -16(%rdi)
243	movaps	48(%rsi), %xmm7
244	pminub	%xmm7, %xmm3
245	pminub	%xmm2, %xmm3
246	pcmpeqb	%xmm0, %xmm3
247	pmovmskb %xmm3, %rdx
248# ifdef USE_AS_STRNCPY
249	sub	$64, %r8
250	jbe	L(UnalignedLeaveCase2OrCase3)
251# endif
252	test	%rdx, %rdx
253	jz	L(Unaligned64Loop_start)
254
255L(Unaligned64Leave):
256	pxor	%xmm1, %xmm1
257
258	pcmpeqb	%xmm4, %xmm0
259	pcmpeqb	%xmm5, %xmm1
260	pmovmskb %xmm0, %rdx
261	pmovmskb %xmm1, %rcx
262	test	%rdx, %rdx
263	jnz	L(CopyFrom1To16BytesUnaligned_0)
264	test	%rcx, %rcx
265	jnz	L(CopyFrom1To16BytesUnaligned_16)
266
267	pcmpeqb	%xmm6, %xmm0
268	pcmpeqb	%xmm7, %xmm1
269	pmovmskb %xmm0, %rdx
270	pmovmskb %xmm1, %rcx
271	test	%rdx, %rdx
272	jnz	L(CopyFrom1To16BytesUnaligned_32)
273
274	bsf	%rcx, %rdx
275	movdqu	%xmm4, (%rdi)
276	movdqu	%xmm5, 16(%rdi)
277	movdqu	%xmm6, 32(%rdi)
278# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
279# ifdef USE_AS_STPCPY
280	lea	48(%rdi, %rdx), %rax
281# endif
282	movdqu	%xmm7, 48(%rdi)
283	add	$15, %r8
284	sub	%rdx, %r8
285	lea	49(%rdi, %rdx), %rdi
286	jmp	L(StrncpyFillTailWithZero)
287# else
288	add	$48, %rsi
289	add	$48, %rdi
290	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
291# endif
292
293/* If source address alignment == destination address alignment */
294
295L(SourceStringAlignmentLess32):
296	pxor	%xmm0, %xmm0
297	movdqu	(%rsi), %xmm1
298	movdqu	16(%rsi), %xmm2
299	pcmpeqb	%xmm1, %xmm0
300	pmovmskb %xmm0, %rdx
301
302# ifdef USE_AS_STRNCPY
303#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
304	cmp	$16, %r8
305#  else
306	cmp	$17, %r8
307#  endif
308	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
309# endif
310	test	%rdx, %rdx
311	jnz	L(CopyFrom1To16BytesTail1)
312
313	pcmpeqb	%xmm2, %xmm0
314	movdqu	%xmm1, (%rdi)
315	pmovmskb %xmm0, %rdx
316
317# ifdef USE_AS_STRNCPY
318#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
319	cmp	$32, %r8
320#  else
321	cmp	$33, %r8
322#  endif
323	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
324# endif
325	test	%rdx, %rdx
326	jnz	L(CopyFrom1To32Bytes1)
327
328	and	$-16, %rsi
329	and	$15, %rcx
330	jmp	L(Unalign16Both)
331
332/*------End of main part with loops---------------------*/
333
334/* Case1 */
335
336# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
337	.p2align 4
338L(CopyFrom1To16Bytes):
339	add	%rcx, %rdi
340	add	%rcx, %rsi
341	bsf	%rdx, %rdx
342	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
343# endif
344	.p2align 4
345L(CopyFrom1To16BytesTail):
346	add	%rcx, %rsi
347	bsf	%rdx, %rdx
348	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
349
350	.p2align 4
351L(CopyFrom1To32Bytes1):
352	add	$16, %rsi
353	add	$16, %rdi
354# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
355	sub	$16, %r8
356# endif
357L(CopyFrom1To16BytesTail1):
358	bsf	%rdx, %rdx
359	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
360
361	.p2align 4
362L(CopyFrom1To32Bytes):
363	bsf	%rdx, %rdx
364	add	%rcx, %rsi
365	add	$16, %rdx
366	sub	%rcx, %rdx
367	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
368
369	.p2align 4
370L(CopyFrom1To16BytesUnaligned_0):
371	bsf	%rdx, %rdx
372# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
373# ifdef USE_AS_STPCPY
374	lea	(%rdi, %rdx), %rax
375# endif
376	movdqu	%xmm4, (%rdi)
377	add	$63, %r8
378	sub	%rdx, %r8
379	lea	1(%rdi, %rdx), %rdi
380	jmp	L(StrncpyFillTailWithZero)
381# else
382	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
383# endif
384
385	.p2align 4
386L(CopyFrom1To16BytesUnaligned_16):
387	bsf	%rcx, %rdx
388	movdqu	%xmm4, (%rdi)
389# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
390# ifdef USE_AS_STPCPY
391	lea	16(%rdi, %rdx), %rax
392# endif
393	movdqu	%xmm5, 16(%rdi)
394	add	$47, %r8
395	sub	%rdx, %r8
396	lea	17(%rdi, %rdx), %rdi
397	jmp	L(StrncpyFillTailWithZero)
398# else
399	add	$16, %rsi
400	add	$16, %rdi
401	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
402# endif
403
404	.p2align 4
405L(CopyFrom1To16BytesUnaligned_32):
406	bsf	%rdx, %rdx
407	movdqu	%xmm4, (%rdi)
408	movdqu	%xmm5, 16(%rdi)
409# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
410# ifdef USE_AS_STPCPY
411	lea	32(%rdi, %rdx), %rax
412# endif
413	movdqu	%xmm6, 32(%rdi)
414	add	$31, %r8
415	sub	%rdx, %r8
416	lea	33(%rdi, %rdx), %rdi
417	jmp	L(StrncpyFillTailWithZero)
418# else
419	add	$32, %rsi
420	add	$32, %rdi
421	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
422# endif
423
424# ifdef USE_AS_STRNCPY
425#  ifndef USE_AS_STRCAT
426	.p2align 4
427L(CopyFrom1To16BytesUnalignedXmm6):
428	movdqu	%xmm6, (%rdi, %rcx)
429	jmp	L(CopyFrom1To16BytesXmmExit)
430
431	.p2align 4
432L(CopyFrom1To16BytesUnalignedXmm5):
433	movdqu	%xmm5, (%rdi, %rcx)
434	jmp	L(CopyFrom1To16BytesXmmExit)
435
436	.p2align 4
437L(CopyFrom1To16BytesUnalignedXmm4):
438	movdqu	%xmm4, (%rdi, %rcx)
439	jmp	L(CopyFrom1To16BytesXmmExit)
440
441	.p2align 4
442L(CopyFrom1To16BytesUnalignedXmm3):
443	movdqu	%xmm3, (%rdi, %rcx)
444	jmp	L(CopyFrom1To16BytesXmmExit)
445
446	.p2align 4
447L(CopyFrom1To16BytesUnalignedXmm1):
448	movdqu	%xmm1, (%rdi, %rcx)
449	jmp	L(CopyFrom1To16BytesXmmExit)
450#  endif
451
452	.p2align 4
453L(CopyFrom1To16BytesExit):
454	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
455
456/* Case2 */
457
458	.p2align 4
459L(CopyFrom1To16BytesCase2):
460	add	$16, %r8
461	add	%rcx, %rdi
462	add	%rcx, %rsi
463	bsf	%rdx, %rdx
464	cmp	%r8, %rdx
465	jb	L(CopyFrom1To16BytesExit)
466	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
467
468	.p2align 4
469L(CopyFrom1To32BytesCase2):
470	add	%rcx, %rsi
471	bsf	%rdx, %rdx
472	add	$16, %rdx
473	sub	%rcx, %rdx
474	cmp	%r8, %rdx
475	jb	L(CopyFrom1To16BytesExit)
476	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
477
478L(CopyFrom1To16BytesTailCase2):
479	add	%rcx, %rsi
480	bsf	%rdx, %rdx
481	cmp	%r8, %rdx
482	jb	L(CopyFrom1To16BytesExit)
483	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
484
485L(CopyFrom1To16BytesTail1Case2):
486	bsf	%rdx, %rdx
487	cmp	%r8, %rdx
488	jb	L(CopyFrom1To16BytesExit)
489	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
490
491/* Case2 or Case3,  Case3 */
492
493	.p2align 4
494L(CopyFrom1To16BytesCase2OrCase3):
495	test	%rdx, %rdx
496	jnz	L(CopyFrom1To16BytesCase2)
497L(CopyFrom1To16BytesCase3):
498	add	$16, %r8
499	add	%rcx, %rdi
500	add	%rcx, %rsi
501	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
502
503	.p2align 4
504L(CopyFrom1To32BytesCase2OrCase3):
505	test	%rdx, %rdx
506	jnz	L(CopyFrom1To32BytesCase2)
507	add	%rcx, %rsi
508	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
509
510	.p2align 4
511L(CopyFrom1To16BytesTailCase2OrCase3):
512	test	%rdx, %rdx
513	jnz	L(CopyFrom1To16BytesTailCase2)
514	add	%rcx, %rsi
515	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
516
517	.p2align 4
518L(CopyFrom1To32Bytes1Case2OrCase3):
519	add	$16, %rdi
520	add	$16, %rsi
521	sub	$16, %r8
522L(CopyFrom1To16BytesTail1Case2OrCase3):
523	test	%rdx, %rdx
524	jnz	L(CopyFrom1To16BytesTail1Case2)
525	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
526
527# endif
528
529/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/
530
531	.p2align 4
532L(Exit1):
533	mov	%dh, (%rdi)
534# ifdef USE_AS_STPCPY
535	lea	(%rdi), %rax
536# endif
537# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
538	sub	$1, %r8
539	lea	1(%rdi), %rdi
540	jnz	L(StrncpyFillTailWithZero)
541# endif
542	ret
543
544	.p2align 4
545L(Exit2):
546	mov	(%rsi), %dx
547	mov	%dx, (%rdi)
548# ifdef USE_AS_STPCPY
549	lea	1(%rdi), %rax
550# endif
551# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
552	sub	$2, %r8
553	lea	2(%rdi), %rdi
554	jnz	L(StrncpyFillTailWithZero)
555# endif
556	ret
557
558	.p2align 4
559L(Exit3):
560	mov	(%rsi), %cx
561	mov	%cx, (%rdi)
562	mov	%dh, 2(%rdi)
563# ifdef USE_AS_STPCPY
564	lea	2(%rdi), %rax
565# endif
566# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
567	sub	$3, %r8
568	lea	3(%rdi), %rdi
569	jnz	L(StrncpyFillTailWithZero)
570# endif
571	ret
572
573	.p2align 4
574L(Exit4):
575	mov	(%rsi), %edx
576	mov	%edx, (%rdi)
577# ifdef USE_AS_STPCPY
578	lea	3(%rdi), %rax
579# endif
580# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
581	sub	$4, %r8
582	lea	4(%rdi), %rdi
583	jnz	L(StrncpyFillTailWithZero)
584# endif
585	ret
586
587	.p2align 4
588L(Exit5):
589	mov	(%rsi), %ecx
590	mov	%dh, 4(%rdi)
591	mov	%ecx, (%rdi)
592# ifdef USE_AS_STPCPY
593	lea	4(%rdi), %rax
594# endif
595# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
596	sub	$5, %r8
597	lea	5(%rdi), %rdi
598	jnz	L(StrncpyFillTailWithZero)
599# endif
600	ret
601
602	.p2align 4
603L(Exit6):
604	mov	(%rsi), %ecx
605	mov	4(%rsi), %dx
606	mov	%ecx, (%rdi)
607	mov	%dx, 4(%rdi)
608# ifdef USE_AS_STPCPY
609	lea	5(%rdi), %rax
610# endif
611# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
612	sub	$6, %r8
613	lea	6(%rdi), %rdi
614	jnz	L(StrncpyFillTailWithZero)
615# endif
616	ret
617
618	.p2align 4
619L(Exit7):
620	mov	(%rsi), %ecx
621	mov	3(%rsi), %edx
622	mov	%ecx, (%rdi)
623	mov	%edx, 3(%rdi)
624# ifdef USE_AS_STPCPY
625	lea	6(%rdi), %rax
626# endif
627# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
628	sub	$7, %r8
629	lea	7(%rdi), %rdi
630	jnz	L(StrncpyFillTailWithZero)
631# endif
632	ret
633
634	.p2align 4
635L(Exit8):
636	mov	(%rsi), %rdx
637	mov	%rdx, (%rdi)
638# ifdef USE_AS_STPCPY
639	lea	7(%rdi), %rax
640# endif
641# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
642	sub	$8, %r8
643	lea	8(%rdi), %rdi
644	jnz	L(StrncpyFillTailWithZero)
645# endif
646	ret
647
648	.p2align 4
649L(Exit9):
650	mov	(%rsi), %rcx
651	mov	%dh, 8(%rdi)
652	mov	%rcx, (%rdi)
653# ifdef USE_AS_STPCPY
654	lea	8(%rdi), %rax
655# endif
656# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
657	sub	$9, %r8
658	lea	9(%rdi), %rdi
659	jnz	L(StrncpyFillTailWithZero)
660# endif
661	ret
662
663	.p2align 4
664L(Exit10):
665	mov	(%rsi), %rcx
666	mov	8(%rsi), %dx
667	mov	%rcx, (%rdi)
668	mov	%dx, 8(%rdi)
669# ifdef USE_AS_STPCPY
670	lea	9(%rdi), %rax
671# endif
672# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
673	sub	$10, %r8
674	lea	10(%rdi), %rdi
675	jnz	L(StrncpyFillTailWithZero)
676# endif
677	ret
678
679	.p2align 4
680L(Exit11):
681	mov	(%rsi), %rcx
682	mov	7(%rsi), %edx
683	mov	%rcx, (%rdi)
684	mov	%edx, 7(%rdi)
685# ifdef USE_AS_STPCPY
686	lea	10(%rdi), %rax
687# endif
688# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
689	sub	$11, %r8
690	lea	11(%rdi), %rdi
691	jnz	L(StrncpyFillTailWithZero)
692# endif
693	ret
694
695	.p2align 4
696L(Exit12):
697	mov	(%rsi), %rcx
698	mov	8(%rsi), %edx
699	mov	%rcx, (%rdi)
700	mov	%edx, 8(%rdi)
701# ifdef USE_AS_STPCPY
702	lea	11(%rdi), %rax
703# endif
704# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
705	sub	$12, %r8
706	lea	12(%rdi), %rdi
707	jnz	L(StrncpyFillTailWithZero)
708# endif
709	ret
710
711	.p2align 4
712L(Exit13):
713	mov	(%rsi), %rcx
714	mov	5(%rsi), %rdx
715	mov	%rcx, (%rdi)
716	mov	%rdx, 5(%rdi)
717# ifdef USE_AS_STPCPY
718	lea	12(%rdi), %rax
719# endif
720# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
721	sub	$13, %r8
722	lea	13(%rdi), %rdi
723	jnz	L(StrncpyFillTailWithZero)
724# endif
725	ret
726
727	.p2align 4
728L(Exit14):
729	mov	(%rsi), %rcx
730	mov	6(%rsi), %rdx
731	mov	%rcx, (%rdi)
732	mov	%rdx, 6(%rdi)
733# ifdef USE_AS_STPCPY
734	lea	13(%rdi), %rax
735# endif
736# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
737	sub	$14, %r8
738	lea	14(%rdi), %rdi
739	jnz	L(StrncpyFillTailWithZero)
740# endif
741	ret
742
743	.p2align 4
744L(Exit15):
745	mov	(%rsi), %rcx
746	mov	7(%rsi), %rdx
747	mov	%rcx, (%rdi)
748	mov	%rdx, 7(%rdi)
749# ifdef USE_AS_STPCPY
750	lea	14(%rdi), %rax
751# endif
752# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
753	sub	$15, %r8
754	lea	15(%rdi), %rdi
755	jnz	L(StrncpyFillTailWithZero)
756# endif
757	ret
758
759	.p2align 4
760L(Exit16):
761	movdqu	(%rsi), %xmm0
762	movdqu	%xmm0, (%rdi)
763# ifdef USE_AS_STPCPY
764	lea	15(%rdi), %rax
765# endif
766# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
767	sub	$16, %r8
768	lea	16(%rdi), %rdi
769	jnz	L(StrncpyFillTailWithZero)
770# endif
771	ret
772
773	.p2align 4
774L(Exit17):
775	movdqu	(%rsi), %xmm0
776	movdqu	%xmm0, (%rdi)
777	mov	%dh, 16(%rdi)
778# ifdef USE_AS_STPCPY
779	lea	16(%rdi), %rax
780# endif
781# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
782	sub	$17, %r8
783	lea	17(%rdi), %rdi
784	jnz	L(StrncpyFillTailWithZero)
785# endif
786	ret
787
788	.p2align 4
789L(Exit18):
790	movdqu	(%rsi), %xmm0
791	mov	16(%rsi), %cx
792	movdqu	%xmm0, (%rdi)
793	mov	%cx, 16(%rdi)
794# ifdef USE_AS_STPCPY
795	lea	17(%rdi), %rax
796# endif
797# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
798	sub	$18, %r8
799	lea	18(%rdi), %rdi
800	jnz	L(StrncpyFillTailWithZero)
801# endif
802	ret
803
804	.p2align 4
805L(Exit19):
806	movdqu	(%rsi), %xmm0
807	mov	15(%rsi), %ecx
808	movdqu	%xmm0, (%rdi)
809	mov	%ecx, 15(%rdi)
810# ifdef USE_AS_STPCPY
811	lea	18(%rdi), %rax
812# endif
813# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
814	sub	$19, %r8
815	lea	19(%rdi), %rdi
816	jnz	L(StrncpyFillTailWithZero)
817# endif
818	ret
819
820	.p2align 4
821L(Exit20):
822	movdqu	(%rsi), %xmm0
823	mov	16(%rsi), %ecx
824	movdqu	%xmm0, (%rdi)
825	mov	%ecx, 16(%rdi)
826# ifdef USE_AS_STPCPY
827	lea	19(%rdi), %rax
828# endif
829# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
830	sub	$20, %r8
831	lea	20(%rdi), %rdi
832	jnz	L(StrncpyFillTailWithZero)
833# endif
834	ret
835
836	.p2align 4
837L(Exit21):
838	movdqu	(%rsi), %xmm0
839	mov	16(%rsi), %ecx
840	movdqu	%xmm0, (%rdi)
841	mov	%ecx, 16(%rdi)
842	mov	%dh, 20(%rdi)
843# ifdef USE_AS_STPCPY
844	lea	20(%rdi), %rax
845# endif
846# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
847	sub	$21, %r8
848	lea	21(%rdi), %rdi
849	jnz	L(StrncpyFillTailWithZero)
850# endif
851	ret
852
853	.p2align 4
854L(Exit22):
855	movdqu	(%rsi), %xmm0
856	mov	14(%rsi), %rcx
857	movdqu	%xmm0, (%rdi)
858	mov	%rcx, 14(%rdi)
859# ifdef USE_AS_STPCPY
860	lea	21(%rdi), %rax
861# endif
862# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
863	sub	$22, %r8
864	lea	22(%rdi), %rdi
865	jnz	L(StrncpyFillTailWithZero)
866# endif
867	ret
868
869	.p2align 4
870L(Exit23):
871	movdqu	(%rsi), %xmm0
872	mov	15(%rsi), %rcx
873	movdqu	%xmm0, (%rdi)
874	mov	%rcx, 15(%rdi)
875# ifdef USE_AS_STPCPY
876	lea	22(%rdi), %rax
877# endif
878# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
879	sub	$23, %r8
880	lea	23(%rdi), %rdi
881	jnz	L(StrncpyFillTailWithZero)
882# endif
883	ret
884
885	.p2align 4
886L(Exit24):
887	movdqu	(%rsi), %xmm0
888	mov	16(%rsi), %rcx
889	movdqu	%xmm0, (%rdi)
890	mov	%rcx, 16(%rdi)
891# ifdef USE_AS_STPCPY
892	lea	23(%rdi), %rax
893# endif
894# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
895	sub	$24, %r8
896	lea	24(%rdi), %rdi
897	jnz	L(StrncpyFillTailWithZero)
898# endif
899	ret
900
901	.p2align 4
902L(Exit25):
903	movdqu	(%rsi), %xmm0
904	mov	16(%rsi), %rcx
905	movdqu	%xmm0, (%rdi)
906	mov	%rcx, 16(%rdi)
907	mov	%dh, 24(%rdi)
908# ifdef USE_AS_STPCPY
909	lea	24(%rdi), %rax
910# endif
911# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
912	sub	$25, %r8
913	lea	25(%rdi), %rdi
914	jnz	L(StrncpyFillTailWithZero)
915# endif
916	ret
917
918	.p2align 4
919L(Exit26):
920	movdqu	(%rsi), %xmm0
921	mov	16(%rsi), %rdx
922	mov	24(%rsi), %cx
923	movdqu	%xmm0, (%rdi)
924	mov	%rdx, 16(%rdi)
925	mov	%cx, 24(%rdi)
926# ifdef USE_AS_STPCPY
927	lea	25(%rdi), %rax
928# endif
929# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
930	sub	$26, %r8
931	lea	26(%rdi), %rdi
932	jnz	L(StrncpyFillTailWithZero)
933# endif
934	ret
935
936	.p2align 4
937L(Exit27):
938	movdqu	(%rsi), %xmm0
939	mov	16(%rsi), %rdx
940	mov	23(%rsi), %ecx
941	movdqu	%xmm0, (%rdi)
942	mov	%rdx, 16(%rdi)
943	mov	%ecx, 23(%rdi)
944# ifdef USE_AS_STPCPY
945	lea	26(%rdi), %rax
946# endif
947# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
948	sub	$27, %r8
949	lea	27(%rdi), %rdi
950	jnz	L(StrncpyFillTailWithZero)
951# endif
952	ret
953
954	.p2align 4
955L(Exit28):
956	movdqu	(%rsi), %xmm0
957	mov	16(%rsi), %rdx
958	mov	24(%rsi), %ecx
959	movdqu	%xmm0, (%rdi)
960	mov	%rdx, 16(%rdi)
961	mov	%ecx, 24(%rdi)
962# ifdef USE_AS_STPCPY
963	lea	27(%rdi), %rax
964# endif
965# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
966	sub	$28, %r8
967	lea	28(%rdi), %rdi
968	jnz	L(StrncpyFillTailWithZero)
969# endif
970	ret
971
972	.p2align 4
973L(Exit29):
974	movdqu	(%rsi), %xmm0
975	movdqu	13(%rsi), %xmm2
976	movdqu	%xmm0, (%rdi)
977	movdqu	%xmm2, 13(%rdi)
978# ifdef USE_AS_STPCPY
979	lea	28(%rdi), %rax
980# endif
981# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
982	sub	$29, %r8
983	lea	29(%rdi), %rdi
984	jnz	L(StrncpyFillTailWithZero)
985# endif
986	ret
987
988	.p2align 4
989L(Exit30):
990	movdqu	(%rsi), %xmm0
991	movdqu	14(%rsi), %xmm2
992	movdqu	%xmm0, (%rdi)
993	movdqu	%xmm2, 14(%rdi)
994# ifdef USE_AS_STPCPY
995	lea	29(%rdi), %rax
996# endif
997# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
998	sub	$30, %r8
999	lea	30(%rdi), %rdi
1000	jnz	L(StrncpyFillTailWithZero)
1001# endif
1002	ret
1003
1004	.p2align 4
1005L(Exit31):
1006	movdqu	(%rsi), %xmm0
1007	movdqu	15(%rsi), %xmm2
1008	movdqu	%xmm0, (%rdi)
1009	movdqu	%xmm2, 15(%rdi)
1010# ifdef USE_AS_STPCPY
1011	lea	30(%rdi), %rax
1012# endif
1013# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1014	sub	$31, %r8
1015	lea	31(%rdi), %rdi
1016	jnz	L(StrncpyFillTailWithZero)
1017# endif
1018	ret
1019
1020	.p2align 4
1021L(Exit32):
1022	movdqu	(%rsi), %xmm0
1023	movdqu	16(%rsi), %xmm2
1024	movdqu	%xmm0, (%rdi)
1025	movdqu	%xmm2, 16(%rdi)
1026# ifdef USE_AS_STPCPY
1027	lea	31(%rdi), %rax
1028# endif
1029# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1030	sub	$32, %r8
1031	lea	32(%rdi), %rdi
1032	jnz	L(StrncpyFillTailWithZero)
1033# endif
1034	ret
1035
1036# ifdef USE_AS_STRNCPY
1037
1038	.p2align 4
1039L(StrncpyExit0):
1040#  ifdef USE_AS_STPCPY
1041	mov	%rdi, %rax
1042#  endif
1043#  ifdef USE_AS_STRCAT
1044	xor	%ch, %ch
1045	movb	%ch, (%rdi)
1046#  endif
1047	ret
1048
1049	.p2align 4
1050L(StrncpyExit1):
1051	mov	(%rsi), %dl
1052	mov	%dl, (%rdi)
1053#  ifdef USE_AS_STPCPY
1054	lea	1(%rdi), %rax
1055#  endif
1056#  ifdef USE_AS_STRCAT
1057	xor	%ch, %ch
1058	movb	%ch, 1(%rdi)
1059#  endif
1060	ret
1061
1062	.p2align 4
1063L(StrncpyExit2):
1064	mov	(%rsi), %dx
1065	mov	%dx, (%rdi)
1066#  ifdef USE_AS_STPCPY
1067	lea	2(%rdi), %rax
1068#  endif
1069#  ifdef USE_AS_STRCAT
1070	xor	%ch, %ch
1071	movb	%ch, 2(%rdi)
1072#  endif
1073	ret
1074
1075	.p2align 4
1076L(StrncpyExit3):
1077	mov	(%rsi), %cx
1078	mov	2(%rsi), %dl
1079	mov	%cx, (%rdi)
1080	mov	%dl, 2(%rdi)
1081#  ifdef USE_AS_STPCPY
1082	lea	3(%rdi), %rax
1083#  endif
1084#  ifdef USE_AS_STRCAT
1085	xor	%ch, %ch
1086	movb	%ch, 3(%rdi)
1087#  endif
1088	ret
1089
1090	.p2align 4
1091L(StrncpyExit4):
1092	mov	(%rsi), %edx
1093	mov	%edx, (%rdi)
1094#  ifdef USE_AS_STPCPY
1095	lea	4(%rdi), %rax
1096#  endif
1097#  ifdef USE_AS_STRCAT
1098	xor	%ch, %ch
1099	movb	%ch, 4(%rdi)
1100#  endif
1101	ret
1102
1103	.p2align 4
1104L(StrncpyExit5):
1105	mov	(%rsi), %ecx
1106	mov	4(%rsi), %dl
1107	mov	%ecx, (%rdi)
1108	mov	%dl, 4(%rdi)
1109#  ifdef USE_AS_STPCPY
1110	lea	5(%rdi), %rax
1111#  endif
1112#  ifdef USE_AS_STRCAT
1113	xor	%ch, %ch
1114	movb	%ch, 5(%rdi)
1115#  endif
1116	ret
1117
1118	.p2align 4
1119L(StrncpyExit6):
1120	mov	(%rsi), %ecx
1121	mov	4(%rsi), %dx
1122	mov	%ecx, (%rdi)
1123	mov	%dx, 4(%rdi)
1124#  ifdef USE_AS_STPCPY
1125	lea	6(%rdi), %rax
1126#  endif
1127#  ifdef USE_AS_STRCAT
1128	xor	%ch, %ch
1129	movb	%ch, 6(%rdi)
1130#  endif
1131	ret
1132
1133	.p2align 4
1134L(StrncpyExit7):
1135	mov	(%rsi), %ecx
1136	mov	3(%rsi), %edx
1137	mov	%ecx, (%rdi)
1138	mov	%edx, 3(%rdi)
1139#  ifdef USE_AS_STPCPY
1140	lea	7(%rdi), %rax
1141#  endif
1142#  ifdef USE_AS_STRCAT
1143	xor	%ch, %ch
1144	movb	%ch, 7(%rdi)
1145#  endif
1146	ret
1147
1148	.p2align 4
1149L(StrncpyExit8):
1150	mov	(%rsi), %rdx
1151	mov	%rdx, (%rdi)
1152#  ifdef USE_AS_STPCPY
1153	lea	8(%rdi), %rax
1154#  endif
1155#  ifdef USE_AS_STRCAT
1156	xor	%ch, %ch
1157	movb	%ch, 8(%rdi)
1158#  endif
1159	ret
1160
1161	.p2align 4
1162L(StrncpyExit9):
1163	mov	(%rsi), %rcx
1164	mov	8(%rsi), %dl
1165	mov	%rcx, (%rdi)
1166	mov	%dl, 8(%rdi)
1167#  ifdef USE_AS_STPCPY
1168	lea	9(%rdi), %rax
1169#  endif
1170#  ifdef USE_AS_STRCAT
1171	xor	%ch, %ch
1172	movb	%ch, 9(%rdi)
1173#  endif
1174	ret
1175
1176	.p2align 4
1177L(StrncpyExit10):
1178	mov	(%rsi), %rcx
1179	mov	8(%rsi), %dx
1180	mov	%rcx, (%rdi)
1181	mov	%dx, 8(%rdi)
1182#  ifdef USE_AS_STPCPY
1183	lea	10(%rdi), %rax
1184#  endif
1185#  ifdef USE_AS_STRCAT
1186	xor	%ch, %ch
1187	movb	%ch, 10(%rdi)
1188#  endif
1189	ret
1190
1191	.p2align 4
1192L(StrncpyExit11):
1193	mov	(%rsi), %rcx
1194	mov	7(%rsi), %edx
1195	mov	%rcx, (%rdi)
1196	mov	%edx, 7(%rdi)
1197#  ifdef USE_AS_STPCPY
1198	lea	11(%rdi), %rax
1199#  endif
1200#  ifdef USE_AS_STRCAT
1201	xor	%ch, %ch
1202	movb	%ch, 11(%rdi)
1203#  endif
1204	ret
1205
1206	.p2align 4
1207L(StrncpyExit12):
1208	mov	(%rsi), %rcx
1209	mov	8(%rsi), %edx
1210	mov	%rcx, (%rdi)
1211	mov	%edx, 8(%rdi)
1212#  ifdef USE_AS_STPCPY
1213	lea	12(%rdi), %rax
1214#  endif
1215#  ifdef USE_AS_STRCAT
1216	xor	%ch, %ch
1217	movb	%ch, 12(%rdi)
1218#  endif
1219	ret
1220
1221	.p2align 4
1222L(StrncpyExit13):
1223	mov	(%rsi), %rcx
1224	mov	5(%rsi), %rdx
1225	mov	%rcx, (%rdi)
1226	mov	%rdx, 5(%rdi)
1227#  ifdef USE_AS_STPCPY
1228	lea	13(%rdi), %rax
1229#  endif
1230#  ifdef USE_AS_STRCAT
1231	xor	%ch, %ch
1232	movb	%ch, 13(%rdi)
1233#  endif
1234	ret
1235
1236	.p2align 4
1237L(StrncpyExit14):
1238	mov	(%rsi), %rcx
1239	mov	6(%rsi), %rdx
1240	mov	%rcx, (%rdi)
1241	mov	%rdx, 6(%rdi)
1242#  ifdef USE_AS_STPCPY
1243	lea	14(%rdi), %rax
1244#  endif
1245#  ifdef USE_AS_STRCAT
1246	xor	%ch, %ch
1247	movb	%ch, 14(%rdi)
1248#  endif
1249	ret
1250
1251	.p2align 4
1252L(StrncpyExit15):
1253	mov	(%rsi), %rcx
1254	mov	7(%rsi), %rdx
1255	mov	%rcx, (%rdi)
1256	mov	%rdx, 7(%rdi)
1257#  ifdef USE_AS_STPCPY
1258	lea	15(%rdi), %rax
1259#  endif
1260#  ifdef USE_AS_STRCAT
1261	xor	%ch, %ch
1262	movb	%ch, 15(%rdi)
1263#  endif
1264	ret
1265
1266	.p2align 4
1267L(StrncpyExit16):
1268	movdqu	(%rsi), %xmm0
1269	movdqu	%xmm0, (%rdi)
1270#  ifdef USE_AS_STPCPY
1271	lea	16(%rdi), %rax
1272#  endif
1273#  ifdef USE_AS_STRCAT
1274	xor	%ch, %ch
1275	movb	%ch, 16(%rdi)
1276#  endif
1277	ret
1278
1279	.p2align 4
1280L(StrncpyExit17):
1281	movdqu	(%rsi), %xmm0
1282	mov	16(%rsi), %cl
1283	movdqu	%xmm0, (%rdi)
1284	mov	%cl, 16(%rdi)
1285#  ifdef USE_AS_STPCPY
1286	lea	17(%rdi), %rax
1287#  endif
1288#  ifdef USE_AS_STRCAT
1289	xor	%ch, %ch
1290	movb	%ch, 17(%rdi)
1291#  endif
1292	ret
1293
1294	.p2align 4
1295L(StrncpyExit18):
1296	movdqu	(%rsi), %xmm0
1297	mov	16(%rsi), %cx
1298	movdqu	%xmm0, (%rdi)
1299	mov	%cx, 16(%rdi)
1300#  ifdef USE_AS_STPCPY
1301	lea	18(%rdi), %rax
1302#  endif
1303#  ifdef USE_AS_STRCAT
1304	xor	%ch, %ch
1305	movb	%ch, 18(%rdi)
1306#  endif
1307	ret
1308
1309	.p2align 4
1310L(StrncpyExit19):
1311	movdqu	(%rsi), %xmm0
1312	mov	15(%rsi), %ecx
1313	movdqu	%xmm0, (%rdi)
1314	mov	%ecx, 15(%rdi)
1315#  ifdef USE_AS_STPCPY
1316	lea	19(%rdi), %rax
1317#  endif
1318#  ifdef USE_AS_STRCAT
1319	xor	%ch, %ch
1320	movb	%ch, 19(%rdi)
1321#  endif
1322	ret
1323
1324	.p2align 4
1325L(StrncpyExit20):
1326	movdqu	(%rsi), %xmm0
1327	mov	16(%rsi), %ecx
1328	movdqu	%xmm0, (%rdi)
1329	mov	%ecx, 16(%rdi)
1330#  ifdef USE_AS_STPCPY
1331	lea	20(%rdi), %rax
1332#  endif
1333#  ifdef USE_AS_STRCAT
1334	xor	%ch, %ch
1335	movb	%ch, 20(%rdi)
1336#  endif
1337	ret
1338
1339	.p2align 4
1340L(StrncpyExit21):
1341	movdqu	(%rsi), %xmm0
1342	mov	16(%rsi), %ecx
1343	mov	20(%rsi), %dl
1344	movdqu	%xmm0, (%rdi)
1345	mov	%ecx, 16(%rdi)
1346	mov	%dl, 20(%rdi)
1347#  ifdef USE_AS_STPCPY
1348	lea	21(%rdi), %rax
1349#  endif
1350#  ifdef USE_AS_STRCAT
1351	xor	%ch, %ch
1352	movb	%ch, 21(%rdi)
1353#  endif
1354	ret
1355
1356	.p2align 4
1357L(StrncpyExit22):
1358	movdqu	(%rsi), %xmm0
1359	mov	14(%rsi), %rcx
1360	movdqu	%xmm0, (%rdi)
1361	mov	%rcx, 14(%rdi)
1362#  ifdef USE_AS_STPCPY
1363	lea	22(%rdi), %rax
1364#  endif
1365#  ifdef USE_AS_STRCAT
1366	xor	%ch, %ch
1367	movb	%ch, 22(%rdi)
1368#  endif
1369	ret
1370
1371	.p2align 4
1372L(StrncpyExit23):
1373	movdqu	(%rsi), %xmm0
1374	mov	15(%rsi), %rcx
1375	movdqu	%xmm0, (%rdi)
1376	mov	%rcx, 15(%rdi)
1377#  ifdef USE_AS_STPCPY
1378	lea	23(%rdi), %rax
1379#  endif
1380#  ifdef USE_AS_STRCAT
1381	xor	%ch, %ch
1382	movb	%ch, 23(%rdi)
1383#  endif
1384	ret
1385
1386	.p2align 4
1387L(StrncpyExit24):
1388	movdqu	(%rsi), %xmm0
1389	mov	16(%rsi), %rcx
1390	movdqu	%xmm0, (%rdi)
1391	mov	%rcx, 16(%rdi)
1392#  ifdef USE_AS_STPCPY
1393	lea	24(%rdi), %rax
1394#  endif
1395#  ifdef USE_AS_STRCAT
1396	xor	%ch, %ch
1397	movb	%ch, 24(%rdi)
1398#  endif
1399	ret
1400
1401	.p2align 4
1402L(StrncpyExit25):
1403	movdqu	(%rsi), %xmm0
1404	mov	16(%rsi), %rdx
1405	mov	24(%rsi), %cl
1406	movdqu	%xmm0, (%rdi)
1407	mov	%rdx, 16(%rdi)
1408	mov	%cl, 24(%rdi)
1409#  ifdef USE_AS_STPCPY
1410	lea	25(%rdi), %rax
1411#  endif
1412#  ifdef USE_AS_STRCAT
1413	xor	%ch, %ch
1414	movb	%ch, 25(%rdi)
1415#  endif
1416	ret
1417
1418	.p2align 4
1419L(StrncpyExit26):
1420	movdqu	(%rsi), %xmm0
1421	mov	16(%rsi), %rdx
1422	mov	24(%rsi), %cx
1423	movdqu	%xmm0, (%rdi)
1424	mov	%rdx, 16(%rdi)
1425	mov	%cx, 24(%rdi)
1426#  ifdef USE_AS_STPCPY
1427	lea	26(%rdi), %rax
1428#  endif
1429#  ifdef USE_AS_STRCAT
1430	xor	%ch, %ch
1431	movb	%ch, 26(%rdi)
1432#  endif
1433	ret
1434
1435	.p2align 4
1436L(StrncpyExit27):
1437	movdqu	(%rsi), %xmm0
1438	mov	16(%rsi), %rdx
1439	mov	23(%rsi), %ecx
1440	movdqu	%xmm0, (%rdi)
1441	mov	%rdx, 16(%rdi)
1442	mov	%ecx, 23(%rdi)
1443#  ifdef USE_AS_STPCPY
1444	lea	27(%rdi), %rax
1445#  endif
1446#  ifdef USE_AS_STRCAT
1447	xor	%ch, %ch
1448	movb	%ch, 27(%rdi)
1449#  endif
1450	ret
1451
1452	.p2align 4
1453L(StrncpyExit28):
1454	movdqu	(%rsi), %xmm0
1455	mov	16(%rsi), %rdx
1456	mov	24(%rsi), %ecx
1457	movdqu	%xmm0, (%rdi)
1458	mov	%rdx, 16(%rdi)
1459	mov	%ecx, 24(%rdi)
1460#  ifdef USE_AS_STPCPY
1461	lea	28(%rdi), %rax
1462#  endif
1463#  ifdef USE_AS_STRCAT
1464	xor	%ch, %ch
1465	movb	%ch, 28(%rdi)
1466#  endif
1467	ret
1468
1469	.p2align 4
1470L(StrncpyExit29):
1471	movdqu	(%rsi), %xmm0
1472	movdqu	13(%rsi), %xmm2
1473	movdqu	%xmm0, (%rdi)
1474	movdqu	%xmm2, 13(%rdi)
1475#  ifdef USE_AS_STPCPY
1476	lea	29(%rdi), %rax
1477#  endif
1478#  ifdef USE_AS_STRCAT
1479	xor	%ch, %ch
1480	movb	%ch, 29(%rdi)
1481#  endif
1482	ret
1483
1484	.p2align 4
1485L(StrncpyExit30):
1486	movdqu	(%rsi), %xmm0
1487	movdqu	14(%rsi), %xmm2
1488	movdqu	%xmm0, (%rdi)
1489	movdqu	%xmm2, 14(%rdi)
1490#  ifdef USE_AS_STPCPY
1491	lea	30(%rdi), %rax
1492#  endif
1493#  ifdef USE_AS_STRCAT
1494	xor	%ch, %ch
1495	movb	%ch, 30(%rdi)
1496#  endif
1497	ret
1498
1499	.p2align 4
1500L(StrncpyExit31):
1501	movdqu	(%rsi), %xmm0
1502	movdqu	15(%rsi), %xmm2
1503	movdqu	%xmm0, (%rdi)
1504	movdqu	%xmm2, 15(%rdi)
1505#  ifdef USE_AS_STPCPY
1506	lea	31(%rdi), %rax
1507#  endif
1508#  ifdef USE_AS_STRCAT
1509	xor	%ch, %ch
1510	movb	%ch, 31(%rdi)
1511#  endif
1512	ret
1513
1514	.p2align 4
1515L(StrncpyExit32):
1516	movdqu	(%rsi), %xmm0
1517	movdqu	16(%rsi), %xmm2
1518	movdqu	%xmm0, (%rdi)
1519	movdqu	%xmm2, 16(%rdi)
1520#  ifdef USE_AS_STPCPY
1521	lea	32(%rdi), %rax
1522#  endif
1523#  ifdef USE_AS_STRCAT
1524	xor	%ch, %ch
1525	movb	%ch, 32(%rdi)
1526#  endif
1527	ret
1528
1529	.p2align 4
1530L(StrncpyExit33):
1531	movdqu	(%rsi), %xmm0
1532	movdqu	16(%rsi), %xmm2
1533	mov	32(%rsi), %cl
1534	movdqu	%xmm0, (%rdi)
1535	movdqu	%xmm2, 16(%rdi)
1536	mov	%cl, 32(%rdi)
1537#  ifdef USE_AS_STRCAT
1538	xor	%ch, %ch
1539	movb	%ch, 33(%rdi)
1540#  endif
1541	ret
1542
1543#  ifndef USE_AS_STRCAT
1544
1545	.p2align 4
1546L(Fill0):
1547	ret
1548
1549	.p2align 4
1550L(Fill1):
1551	mov	%dl, (%rdi)
1552	ret
1553
1554	.p2align 4
1555L(Fill2):
1556	mov	%dx, (%rdi)
1557	ret
1558
1559	.p2align 4
1560L(Fill3):
1561	mov	%edx, -1(%rdi)
1562	ret
1563
1564	.p2align 4
1565L(Fill4):
1566	mov	%edx, (%rdi)
1567	ret
1568
1569	.p2align 4
1570L(Fill5):
1571	mov	%edx, (%rdi)
1572	mov	%dl, 4(%rdi)
1573	ret
1574
1575	.p2align 4
1576L(Fill6):
1577	mov	%edx, (%rdi)
1578	mov	%dx, 4(%rdi)
1579	ret
1580
1581	.p2align 4
1582L(Fill7):
1583	mov	%rdx, -1(%rdi)
1584	ret
1585
1586	.p2align 4
1587L(Fill8):
1588	mov	%rdx, (%rdi)
1589	ret
1590
1591	.p2align 4
1592L(Fill9):
1593	mov	%rdx, (%rdi)
1594	mov	%dl, 8(%rdi)
1595	ret
1596
1597	.p2align 4
1598L(Fill10):
1599	mov	%rdx, (%rdi)
1600	mov	%dx, 8(%rdi)
1601	ret
1602
1603	.p2align 4
1604L(Fill11):
1605	mov	%rdx, (%rdi)
1606	mov	%edx, 7(%rdi)
1607	ret
1608
1609	.p2align 4
1610L(Fill12):
1611	mov	%rdx, (%rdi)
1612	mov	%edx, 8(%rdi)
1613	ret
1614
1615	.p2align 4
1616L(Fill13):
1617	mov	%rdx, (%rdi)
1618	mov	%rdx, 5(%rdi)
1619	ret
1620
1621	.p2align 4
1622L(Fill14):
1623	mov	%rdx, (%rdi)
1624	mov	%rdx, 6(%rdi)
1625	ret
1626
1627	.p2align 4
1628L(Fill15):
1629	movdqu	%xmm0, -1(%rdi)
1630	ret
1631
1632	.p2align 4
1633L(Fill16):
1634	movdqu	%xmm0, (%rdi)
1635	ret
1636
1637	.p2align 4
1638L(CopyFrom1To16BytesUnalignedXmm2):
1639	movdqu	%xmm2, (%rdi, %rcx)
1640
1641	.p2align 4
1642L(CopyFrom1To16BytesXmmExit):
1643	bsf	%rdx, %rdx
1644	add	$15, %r8
1645	add	%rcx, %rdi
1646#   ifdef USE_AS_STPCPY
1647	lea	(%rdi, %rdx), %rax
1648#   endif
1649	sub	%rdx, %r8
1650	lea	1(%rdi, %rdx), %rdi
1651
1652	.p2align 4
1653L(StrncpyFillTailWithZero):
1654	pxor	%xmm0, %xmm0
1655	xor	%rdx, %rdx
1656	sub	$16, %r8
1657	jbe	L(StrncpyFillExit)
1658
1659	movdqu	%xmm0, (%rdi)
1660	add	$16, %rdi
1661
1662	mov	%rdi, %rsi
1663	and	$0xf, %rsi
1664	sub	%rsi, %rdi
1665	add	%rsi, %r8
1666	sub	$64, %r8
1667	jb	L(StrncpyFillLess64)
1668
1669L(StrncpyFillLoopMovdqa):
1670	movdqa	%xmm0, (%rdi)
1671	movdqa	%xmm0, 16(%rdi)
1672	movdqa	%xmm0, 32(%rdi)
1673	movdqa	%xmm0, 48(%rdi)
1674	add	$64, %rdi
1675	sub	$64, %r8
1676	jae	L(StrncpyFillLoopMovdqa)
1677
1678L(StrncpyFillLess64):
1679	add	$32, %r8
1680	jl	L(StrncpyFillLess32)
1681	movdqa	%xmm0, (%rdi)
1682	movdqa	%xmm0, 16(%rdi)
1683	add	$32, %rdi
1684	sub	$16, %r8
1685	jl	L(StrncpyFillExit)
1686	movdqa	%xmm0, (%rdi)
1687	add	$16, %rdi
1688	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
1689
1690L(StrncpyFillLess32):
1691	add	$16, %r8
1692	jl	L(StrncpyFillExit)
1693	movdqa	%xmm0, (%rdi)
1694	add	$16, %rdi
1695	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
1696
1697L(StrncpyFillExit):
1698	add	$16, %r8
1699	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
1700
1701/* end of ifndef USE_AS_STRCAT */
1702#  endif
1703
1704	.p2align 4
1705L(UnalignedLeaveCase2OrCase3):
1706	test	%rdx, %rdx
1707	jnz	L(Unaligned64LeaveCase2)
1708L(Unaligned64LeaveCase3):
1709	lea	64(%r8), %rcx
1710	and	$-16, %rcx
1711	add	$48, %r8
1712	jl	L(CopyFrom1To16BytesCase3)
1713	movdqu	%xmm4, (%rdi)
1714	sub	$16, %r8
1715	jb	L(CopyFrom1To16BytesCase3)
1716	movdqu	%xmm5, 16(%rdi)
1717	sub	$16, %r8
1718	jb	L(CopyFrom1To16BytesCase3)
1719	movdqu	%xmm6, 32(%rdi)
1720	sub	$16, %r8
1721	jb	L(CopyFrom1To16BytesCase3)
1722	movdqu	%xmm7, 48(%rdi)
1723#  ifdef USE_AS_STPCPY
1724	lea	64(%rdi), %rax
1725#  endif
1726#  ifdef USE_AS_STRCAT
1727	xor	%ch, %ch
1728	movb	%ch, 64(%rdi)
1729#  endif
1730	ret
1731
1732	.p2align 4
1733L(Unaligned64LeaveCase2):
1734	xor	%rcx, %rcx
1735	pcmpeqb	%xmm4, %xmm0
1736	pmovmskb %xmm0, %rdx
1737	add	$48, %r8
1738	jle	L(CopyFrom1To16BytesCase2OrCase3)
1739	test	%rdx, %rdx
1740#  ifndef USE_AS_STRCAT
1741	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
1742#  else
1743	jnz	L(CopyFrom1To16Bytes)
1744#  endif
1745	pcmpeqb	%xmm5, %xmm0
1746	pmovmskb %xmm0, %rdx
1747	movdqu	%xmm4, (%rdi)
1748	add	$16, %rcx
1749	sub	$16, %r8
1750	jbe	L(CopyFrom1To16BytesCase2OrCase3)
1751	test	%rdx, %rdx
1752#  ifndef USE_AS_STRCAT
1753	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
1754#  else
1755	jnz	L(CopyFrom1To16Bytes)
1756#  endif
1757
1758	pcmpeqb	%xmm6, %xmm0
1759	pmovmskb %xmm0, %rdx
1760	movdqu	%xmm5, 16(%rdi)
1761	add	$16, %rcx
1762	sub	$16, %r8
1763	jbe	L(CopyFrom1To16BytesCase2OrCase3)
1764	test	%rdx, %rdx
1765#  ifndef USE_AS_STRCAT
1766	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
1767#  else
1768	jnz	L(CopyFrom1To16Bytes)
1769#  endif
1770
1771	pcmpeqb	%xmm7, %xmm0
1772	pmovmskb %xmm0, %rdx
1773	movdqu	%xmm6, 32(%rdi)
1774	lea	16(%rdi, %rcx), %rdi
1775	lea	16(%rsi, %rcx), %rsi
1776	bsf	%rdx, %rdx
1777	cmp	%r8, %rdx
1778	jb	L(CopyFrom1To16BytesExit)
1779	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
1780
1781	.p2align 4
1782L(ExitZero):
1783#  ifndef USE_AS_STRCAT
1784	mov	%rdi, %rax
1785#  endif
1786	ret
1787
1788# endif
1789
1790# ifndef USE_AS_STRCAT
1791END (STRCPY)
1792# else
1793END (STRCAT)
1794# endif
1795	.p2align 4
1796	.section .rodata
1797L(ExitTable):
1798	.int	JMPTBL(L(Exit1), L(ExitTable))
1799	.int	JMPTBL(L(Exit2), L(ExitTable))
1800	.int	JMPTBL(L(Exit3), L(ExitTable))
1801	.int	JMPTBL(L(Exit4), L(ExitTable))
1802	.int	JMPTBL(L(Exit5), L(ExitTable))
1803	.int	JMPTBL(L(Exit6), L(ExitTable))
1804	.int	JMPTBL(L(Exit7), L(ExitTable))
1805	.int	JMPTBL(L(Exit8), L(ExitTable))
1806	.int	JMPTBL(L(Exit9), L(ExitTable))
1807	.int	JMPTBL(L(Exit10), L(ExitTable))
1808	.int	JMPTBL(L(Exit11), L(ExitTable))
1809	.int	JMPTBL(L(Exit12), L(ExitTable))
1810	.int	JMPTBL(L(Exit13), L(ExitTable))
1811	.int	JMPTBL(L(Exit14), L(ExitTable))
1812	.int	JMPTBL(L(Exit15), L(ExitTable))
1813	.int	JMPTBL(L(Exit16), L(ExitTable))
1814	.int	JMPTBL(L(Exit17), L(ExitTable))
1815	.int	JMPTBL(L(Exit18), L(ExitTable))
1816	.int	JMPTBL(L(Exit19), L(ExitTable))
1817	.int	JMPTBL(L(Exit20), L(ExitTable))
1818	.int	JMPTBL(L(Exit21), L(ExitTable))
1819	.int	JMPTBL(L(Exit22), L(ExitTable))
1820	.int    JMPTBL(L(Exit23), L(ExitTable))
1821	.int	JMPTBL(L(Exit24), L(ExitTable))
1822	.int	JMPTBL(L(Exit25), L(ExitTable))
1823	.int	JMPTBL(L(Exit26), L(ExitTable))
1824	.int	JMPTBL(L(Exit27), L(ExitTable))
1825	.int	JMPTBL(L(Exit28), L(ExitTable))
1826	.int	JMPTBL(L(Exit29), L(ExitTable))
1827	.int	JMPTBL(L(Exit30), L(ExitTable))
1828	.int	JMPTBL(L(Exit31), L(ExitTable))
1829	.int	JMPTBL(L(Exit32), L(ExitTable))
1830# ifdef USE_AS_STRNCPY
1831L(ExitStrncpyTable):
1832	.int	JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
1833	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
1834	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
1835	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
1836	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
1837	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
1838	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
1839	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
1840	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
1841	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
1842	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
1843	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
1844	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
1845	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
1846	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
1847	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
1848	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
1849	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
1850	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
1851	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
1852	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
1853	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
1854	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
1855	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
1856	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
1857	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
1858	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
1859	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
1860	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
1861	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
1862	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
1863	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
1864	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
1865	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
1866#  ifndef USE_AS_STRCAT
1867	.p2align 4
1868L(FillTable):
1869	.int	JMPTBL(L(Fill0), L(FillTable))
1870	.int	JMPTBL(L(Fill1), L(FillTable))
1871	.int	JMPTBL(L(Fill2), L(FillTable))
1872	.int	JMPTBL(L(Fill3), L(FillTable))
1873	.int	JMPTBL(L(Fill4), L(FillTable))
1874	.int	JMPTBL(L(Fill5), L(FillTable))
1875	.int	JMPTBL(L(Fill6), L(FillTable))
1876	.int	JMPTBL(L(Fill7), L(FillTable))
1877	.int	JMPTBL(L(Fill8), L(FillTable))
1878	.int	JMPTBL(L(Fill9), L(FillTable))
1879	.int	JMPTBL(L(Fill10), L(FillTable))
1880	.int	JMPTBL(L(Fill11), L(FillTable))
1881	.int	JMPTBL(L(Fill12), L(FillTable))
1882	.int	JMPTBL(L(Fill13), L(FillTable))
1883	.int	JMPTBL(L(Fill14), L(FillTable))
1884	.int	JMPTBL(L(Fill15), L(FillTable))
1885	.int	JMPTBL(L(Fill16), L(FillTable))
1886#  endif
1887# endif
1888#endif
1889