/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
	Returns 'dst' + 'len'.  */
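
/* For reference, the intended behaviour is roughly the C sketch below;
   the copy itself is done inline and memcpy is not actually called:

     void *
     __mempcpy (void *dst, const void *src, size_t len)
     {
       memcpy (dst, src, len);
       return (char *) dst + len;
     }  */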

	.machine  power7
EALIGN (__mempcpy, 5, 0)
	CALL_MCOUNT

	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	stw	31,24(1)
	mr	30,3
	cmplwi	cr1,5,31
	neg	0,3
	cfi_offset(31,-8)
	ble	cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
					code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */
	clrlwi	10,4,29	      /* Check alignment of SRC.  */
	cmplw	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srwi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrlwi	0,0,29
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */
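	/* The mtcrf above moved the low three bits of the alignment count
	   in r0 into cr7, so the bf tests below select the 1-, 2- and
	   4-byte moves (cr7 bit 31 = 1 byte, bit 30 = 2, bit 29 = 4).  */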

1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrlwi	10,12,29      /* Check alignment of SRC again.  */
	srwi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrlwi	11,31,29
	mtcrf	0x01,9

	srwi	8,31,5
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	at a multiple of 32 bytes.  */
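	/* cr7 holds the low bits of the doubleword count in r9: bit 30
	   selects a 2-doubleword copy and bit 31 one extra doubleword, so
	   the 32-byte loop below only runs on exact multiples.  */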

	bf	30,1f
	lfd	6,0(12)
	lfd	7,8(12)
	addi	11,12,16
	mtctr	8
	stfd	6,0(3)
	stfd	7,8(3)
	addi	10,3,16
	bf	31,4f
	lfd	0,16(12)
	stfd	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	lfd	6,0(12)
	addi	11,12,8
	stfd	6,0(3)
	addi	10,3,8

	.align	4
4:	/* Main aligned copy loop. Copies 32-bytes at a time.  */
	lfd	6,0(11)
	lfd	7,8(11)
	lfd	8,16(11)
	lfd	0,24(11)
	addi	11,11,32

	stfd	6,0(10)
	stfd	7,8(10)
	stfd	8,16(10)
	stfd	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */

	clrrwi	0,31,3
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
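	/* r0 is the number of bytes already moved by the doubleword copies
	   above; advance the original pointers past them.  */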
	add	3,3,0
	add	12,12,0

	/*  At this point we have a tail of 0-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmplwi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrwi	11,4,2
	andi.	0,8,3
	cmplwi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-bytes alignment for SRC.  */
	mtocrf  0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmplwi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could've used lfd/stfd here, they are still
	slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	SRC is not. Use aligned quadword loads from SRC, shifted to realign
	the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	andi.	11,3,15	      /* Check alignment of DST.  */
	clrlwi	0,0,28	      /* Number of bytes until the 1st
				 quadword of DST.  */
	srwi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
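	/* The mtcrf above moved the low four bits of r0 into cr7; the bf
	   tests below use them to select the 1-, 2-, 4- and 8-byte moves
	   (cr7 bit 31 = 1 byte ... bit 28 = 8 bytes).  */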
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	lfd	6,0(12)
	addi	12,12,8
	stfd	6,0(3)
	addi	3,3,8
0:
	clrlwi	10,12,28      /* Check alignment of SRC.  */
	srwi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrlwi	11,31,28
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmplwi	cr1,11,0
	srwi	8,31,5	      /* Setup the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmplwi	cr6,9,1
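	/* lvsr/lvsl build a permute control vector from the low bits of the
	   SRC address; vperm below merges two consecutive aligned lvx loads
	   into one quadword realigned for the aligned stvx stores.  */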
#ifdef __LITTLE_ENDIAN__
	lvsr    5,0,12
#else
	lvsl    5,0,12
#endif
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes so the main loop below only handles
	   32-byte multiples.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm   10,3,4,5
#else
	vperm   10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	clrrwi	0,31,4
	mtcrf	0x01,31
	beq	cr1,0f

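	/* r0 is the number of bytes already moved by the quadword copies
	   above; advance the original pointers past them.  */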
	add	3,3,0
	add	12,12,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

END (__mempcpy)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)