/* Optimized memcpy implementation for PowerPC32 on POWER6.
   Copyright (C) 2003-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination word (4-byte) aligned.  Further optimization is
   possible when both source and destination are word aligned.
   Each case has an optimized unrolled loop.   */

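/* As a rough C model of the overall strategy (illustrative only, not
   part of the build; my_memcpy is a hypothetical name):

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   void *my_memcpy (void *dstv, const void *srcv, size_t len)
   {
     unsigned char *d = dstv;
     const unsigned char *s = srcv;
     if (len >= 32)
       {
         size_t pre = (-(uintptr_t) d) & 3;  // 0-3 bytes to word-align dst
         len -= pre;
         while (pre--)
           *d++ = *s++;
         while (len >= 4)                    // word-copy phase; the asm
           {                                 // unrolls this 4x and has a
             uint32_t w;                     // shift/or variant for an
             memcpy (&w, s, 4);              // unaligned source (lwz)
             memcpy (d, &w, 4);              // (stw)
             d += 4; s += 4; len -= 4;
           }
       }
     while (len--)                           // short moves and tail bytes
       *d++ = *s++;
     return dstv;
   }
*/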
	.machine power6
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

    stwu   1,-32(1)
    cfi_adjust_cfa_offset(32)
    cmplwi cr1,5,31     /* check for short move.  */
    neg    0,3
    cmplwi cr1,5,31
    clrlwi 10,4,30	/* check alignment of src.  */
    andi.  11,3,3	/* check alignment of dst.  */
    clrlwi 0,0,30	/* Number of bytes until the 1st word of dst.  */
    ble-   cr1,L(word_unaligned_short)	/* If move < 32 bytes.  */
    cmplw  cr6,10,11
    stw    31,24(1)
    stw    30,20(1)
    cfi_offset(31,(24-32))
    cfi_offset(30,(20-32))
    mr     30,3
    beq    .L0
    mtcrf  0x01,0	/* cr7 = low bits of the dst alignment count.  */
    subf  31,0,5        /* Length after alignment.  */
    add   12,4,0        /* Compute src addr after alignment.  */
  /* Move 0-3 bytes as needed to get the destination word aligned.  */
1:  bf    31,2f
    lbz   6,0(4)
    bf    30,3f
    lhz   7,1(4)
    stb   6,0(3)
    sth   7,1(3)
    addi  3,3,3
    b     0f
3:
    stb   6,0(3)
    addi  3,3,1
    b     0f
2:  bf    30,0f
    lhz   6,0(4)
    sth   6,0(3)
    addi  3,3,2
0:
    clrlwi 10,12,30	/* check alignment of src again.  */
    srwi   9,31,2	/* Number of full words remaining.  */
    bne-   cr6,L(wdu)   /* If source is not word aligned.  */
    clrlwi 11,31,30  /* calculate the number of tail bytes */
    b      L(word_aligned)
  /* Copy words from source to destination, assuming the destination is
     aligned on a word boundary.

     At this point we know there are at least 29 bytes left (32-3) to copy.
     The next step is to determine if the source is also word aligned.
     If not, branch to the unaligned move code at L(wdu), which uses
     a load, shift, store strategy.

     Otherwise source and destination are word aligned, and we can use
     the optimized word copy loop.  */
    .align  4
.L0:
    mr     31,5
    mr     12,4
    bne-   cr6,L(wdu)   /* If source is not word aligned.  */
    srwi   9,5,2	/* Number of full words remaining.  */
    clrlwi 11,5,30      /* calculate the number of tail bytes */

  /* Move words where destination and source are word aligned.
     Use an unrolled loop to copy 4 words (16 bytes) per iteration.
     If the copy is not an exact multiple of 16 bytes, 1-3
     words are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-3 bytes. These bytes are
     copied a halfword/byte at a time as needed to preserve alignment.  */
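/* A hypothetical C model of this loop structure (illustrative only):

   #include <stddef.h>
   #include <stdint.h>

   static void copy_words (uint32_t *d, const uint32_t *s, size_t nwords)
   {
     // Peel 0-3 words, mirroring the bf 30/bf 31 tests on the word
     // count below, so the main loop moves 4 words per iteration.
     if (nwords & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
     if (nwords & 1) { *d++ = *s++; }
     size_t i;
     for (i = nwords >> 2; i > 0; i--)       // mtctr / bdnz
       {
         d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
         d += 4; s += 4;
       }
   }
*/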
L(word_aligned):
    mtcrf 0x01,9	/* cr7 = low bits of the word count for the bf tests.  */
    srwi  8,31,4    /* calculate the 16 byte loop count */
    cmplwi	cr1,9,4
    cmplwi	cr6,11,0
    mr    11,12

    bf    30,1f
    lwz   6,0(12)
    lwz   7,4(12)
    addi  11,12,8
    mtctr 8
    stw   6,0(3)
    stw   7,4(3)
    addi  10,3,8
    bf    31,4f
    lwz   0,8(12)
    stw   0,8(3)
    blt   cr1,3f
    addi  11,12,12
    addi  10,3,12
    b     4f
    .align  4
1:
    mr    10,3
    mtctr 8
    bf    31,4f
    lwz   6,0(12)
    addi  11,12,4
    stw   6,0(3)
    addi  10,3,4

    .align  4
4:
    lwz   6,0(11)
    lwz   7,4(11)
    lwz   8,8(11)
    lwz   0,12(11)
    stw   6,0(10)
    stw   7,4(10)
    stw   8,8(10)
    stw   0,12(10)
    addi  11,11,16
    addi  10,10,16
    bdnz  4b
3:
    clrrwi 0,31,2	/* Number of bytes copied as whole words.  */
    mtcrf 0x01,31
    beq   cr6,0f
.L9:
    add   3,3,0
    add   12,12,0

/*  At this point we have a tail of 0-3 bytes and we know that the
    destination is word aligned.  */
2:  bf    30,1f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
1:  bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    mr  3,30
    lwz 30,20(1)
    lwz 31,24(1)
    addi 1,1,32
    blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-byte,
   128-byte, and 4096-byte boundaries.  Since these short moves are
   unlikely to be unaligned or cross these boundaries, the overhead to
   force alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 128-byte
   boundaries.  Since only loads are sensitive to the 32-/128-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since only word alignment is guaranteed,
   we avoid doubleword load/stores to ensure that all loads are
   aligned.  While the destination and stores may still be unaligned,
   this is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small (~20 cycle) delay.  */
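/* A hypothetical C model of the binary (1,2,4,8) tests for the 0-7
   byte case (illustrative only); each size bit is tested once, so no
   loop is needed:

   #include <stddef.h>
   #include <string.h>

   static void copy_0_to_7 (unsigned char *d, const unsigned char *s,
                            size_t n)
   {
     if (n & 4) { memcpy (d, s, 4); d += 4; s += 4; }  // cf. L(wus_4)
     if (n & 2) { memcpy (d, s, 2); d += 2; s += 2; }  // cf. L(wus_2)
     if (n & 1) *d = *s;                               // cf. L(wus_1)
   }

   An exact 8-byte move is special-cased as two word moves, as in
   L(wus_8) below.  */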
    .align  4

    cfi_same_value (31)
    cfi_same_value (30)
L(word_unaligned_short):
    mtcrf 0x01,5	/* cr7 = low bits of len for the bf tests.  */
    cmplwi cr6,5,8
    neg   8,4
    clrrwi	9,4,2	/* Word-aligned address of the first src word.  */
    andi. 0,8,3		/* Number of bytes (0-3) needed to word-align src.  */
    beq   cr6,L(wus_8)	/* Handle moves of 8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
    cmplwi	cr1,5,16
    mr    12,4
    ble   cr6,L(wus_4)  /* Handle moves of 0-7 bytes.  */
    mr    11,3
    mr    10,5
    cmplwi	cr6,0,2
    beq   L(wus_tail)	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
    lwz   6,0(9)
    subf  10,0,5
    add   12,4,0
    blt   cr6,5f
    srwi  7,6,16
    bgt	  cr6,3f
#ifdef __LITTLE_ENDIAN__
    sth   7,0(3)
#else
    sth   6,0(3)
#endif
    b     7f
    .align  4
3:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,24
    stb   6,0(3)
    sth   7,1(3)
#else
    stb   7,0(3)
    sth   6,1(3)
#endif
    b     7f
    .align  4
5:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,8
#endif
    stb   6,0(3)
7:
    cmplwi	cr1,10,16
    add   11,3,0
    mtcrf 0x01,10
    .align  4
L(wus_tail):
/* At least 6 bytes left and the source is word aligned.  This allows
   some speculative loads up front.  */
/* We need to special case the fall-through because the biggest delays
   are due to the address computation not being ready in time for
   address generation (AGEN).  */
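/* Note that with only 6 bytes guaranteed, the speculative lwz 7,4(12)
   below may read up to 2 bytes past the end of the source.  That is
   safe here because r12 is word aligned and an aligned word load
   cannot cross a page boundary; the equivalent 4-byte read in C would
   be undefined behavior.  */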
    lwz   6,0(12)
    lwz   7,4(12)
    blt   cr1,L(wus_tail8)
    cmplwi	cr0,10,24
L(wus_tail16): /* Move 16 bytes.  */
    stw   6,0(11)
    stw   7,4(11)
    lwz   6,8(12)
    lwz   7,12(12)
    stw   6,8(11)
    stw   7,12(11)
/* Move 8 bytes more.  */
    bf    28,L(wus_tail16p8)
    cmplwi	cr1,10,28
    lwz   6,16(12)
    lwz   7,20(12)
    stw   6,16(11)
    stw   7,20(11)
/* Move 4 bytes more.  */
    bf    29,L(wus_tail16p4)
    lwz   6,24(12)
    stw   6,24(11)
    addi  12,12,28
    addi  11,11,28
    bgt   cr1,L(wus_tail2)
 /* exactly 28 bytes.  Return original dst pointer and exit.  */
    addi  1,1,32
    blr
    .align  4
L(wus_tail16p8):  /* less than 8 bytes left.  */
    beq   cr1,L(wus_tailX) /* exactly 16 bytes, early exit.  */
    cmplwi	cr1,10,20
    bf    29,L(wus_tail16p2)
/* Move 4 bytes more.  */
    lwz   6,16(12)
    stw   6,16(11)
    addi  12,12,20
    addi  11,11,20
    bgt   cr1,L(wus_tail2)
 /* exactly 20 bytes.  Return original dst pointer and exit.  */
    addi  1,1,32
    blr
    .align  4
L(wus_tail16p4):  /* less than 4 bytes left.  */
    addi  12,12,24
    addi  11,11,24
    bgt   cr0,L(wus_tail2)
 /* exactly 24 bytes.  Return original dst pointer and exit.  */
    addi  1,1,32
    blr
    .align  4
L(wus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
    addi  12,12,16
    addi  11,11,16
    b     L(wus_tail2)

    .align  4
L(wus_tail8):  /* Move 8 bytes.  */
/*  r6, r7 already loaded speculatively.  */
    cmplwi	cr1,10,8
    cmplwi	cr0,10,12
    bf    28,L(wus_tail4)
    stw   6,0(11)
    stw   7,4(11)
/* Move 4 bytes more.  */
    bf    29,L(wus_tail8p4)
    lwz   6,8(12)
    stw   6,8(11)
    addi  12,12,12
    addi  11,11,12
    bgt   cr0,L(wus_tail2)
 /* exactly 12 bytes.  Return original dst pointer and exit.  */
    addi  1,1,32
    blr
    .align  4
L(wus_tail8p4):  /* less than 4 bytes left.  */
    addi  12,12,8
    addi  11,11,8
    bgt   cr1,L(wus_tail2)
 /* exactly 8 bytes.  Return original dst pointer and exit.  */
    addi  1,1,32
    blr

    .align  4
L(wus_tail4):  /* Move 4 bytes.  */
/*  r6 already loaded speculatively.  If we are here we know there are
    more than 4 bytes left.  So there is no need to test.  */
    addi  12,12,4
    stw   6,0(11)
    addi  11,11,4
L(wus_tail2):  /* Move 2-3 bytes.  */
    bf    30,L(wus_tail1)
    lhz   6,0(12)
    sth   6,0(11)
    bf    31,L(wus_tailX)
    lbz   7,2(12)
    stb   7,2(11)
    addi  1,1,32
    blr
L(wus_tail1):  /* Move 1 byte.  */
    bf    31,L(wus_tailX)
    lbz   6,0(12)
    stb   6,0(11)
L(wus_tailX):
  /* Return original dst pointer.  */
    addi  1,1,32
    blr

/* Special case to copy 0-8 bytes.  */
    .align  4
L(wus_8):
    lwz   6,0(4)
    lwz   7,4(4)
    stw   6,0(3)
    stw   7,4(3)
  /* Return original dst pointer.  */
    addi  1,1,32
    blr
    .align  4
L(wus_4):
    bf    29,L(wus_2)
    lwz   6,0(4)
    stw   6,0(3)
    bf    30,L(wus_5)
    lhz   7,4(4)
    sth   7,4(3)
    bf    31,L(wus_0)
    lbz   8,6(4)
    stb   8,6(3)
    addi  1,1,32
    blr
    .align  4
L(wus_5):
    bf    31,L(wus_0)
    lbz   6,4(4)
    stb   6,4(3)
  /* Return original dst pointer.  */
    addi 1,1,32
    blr
    .align  4
L(wus_2):  /* Move 2-3 bytes.  */
    bf    30,L(wus_1)
    lhz   6,0(4)
    sth   6,0(3)
    bf    31,L(wus_0)
    lbz   7,2(4)
    stb   7,2(3)
    addi  1,1,32
    blr
    .align  4
L(wus_1):  /* Move 1 byte.  */
    bf    31,L(wus_0)
    lbz   6,0(4)
    stb   6,0(3)
    .align  3
L(wus_0):
  /* Return original dst pointer.  */
    addi  1,1,32
    blr

    .align  4
    cfi_offset(31,(24-32))
    cfi_offset(30,(20-32))
L(wdu):

  /* Copy words where the destination is aligned but the source is
     not.  For power4, power5 and power6 machines there is a penalty
     for unaligned loads (src) that cross 32-byte, cacheline, or page
     boundaries.  So we want to use simple (unaligned) loads where
     possible but avoid them where we know the load would span a
     32-byte boundary.

     At this point we know we have at least 29 (32-3) bytes to copy,
     the src is unaligned, and we may cross at least one 32-byte
     boundary.  Also we have the following register values:
     r3 == adjusted dst, word aligned
     r4 == unadjusted src
     r5 == unadjusted len
     r9 == adjusted word length
     r10 == src alignment (1-3)
     r12 == adjusted src, not aligned
     r31 == adjusted len

     First we need to copy words up to but not crossing the next
     32-byte boundary.  Then perform aligned loads just before and
     just after the boundary and use shifts and ors to generate the
     next aligned word for dst.  If more than 32 bytes remain we copy
     (unaligned src) the next 7 words and repeat the loop until fewer
     than 32 bytes remain.

     Then if more than 4 bytes remain we again use aligned loads,
     shifts and ors to generate the next dst word.  We then process
     the remaining words using unaligned loads as needed.  Finally we
     check if there are any (1-3) bytes remaining and use halfword
     and/or byte load/stores to complete the copy.
*/
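/* A hypothetical C model of the aligned-load/shift/or step used at the
   32-byte boundary (big-endian shifts shown; the little-endian
   variants below swap the directions):

   #include <stdint.h>

   static uint32_t combine (const uint32_t *aligned_src, unsigned off)
   {
     // off = src & 3 and is 1-3 here, so rsh is 8-24 and both shifts
     // below are well defined.
     unsigned lsh = 8 * off;                 // cf. slwi 10,10,3
     unsigned rsh = 32 - lsh;                // cf. subfic 9,10,32
     return (aligned_src[0] << lsh) | (aligned_src[1] >> rsh);
   }
*/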
    mr      4,12      /* restore unaligned adjusted src ptr */
    clrlwi  0,12,27   /* Find distance from previous 32-byte boundary.  */
    slwi    10,10,3   /* calculate number of bits to shift 1st word left */
    cmplwi  cr5,0,16
    subfic  8,0,32   /* Number of bytes to next 32-byte boundary.  */

    mtcrf   0x01,8
    cmplwi  cr1,10,16
    subfic  9,10,32  /* number of bits to shift 2nd word right */
/*  This test is reversed because the timing to compute the bytes to
    the next 32-byte boundary could not be met.  So we compare the
    distance from the previous 32-byte boundary and invert the test:
    (src & 31) >= 16 is equivalent to 32 - (src & 31) <= 16.  */
    bge     cr5,L(wdu_h32_8)
    .align  4
    lwz   6,0(4)
    lwz   7,4(4)
    addi  12,4,16    /* generate alternate pointers to avoid agen */
    addi  11,3,16    /* timing issues downstream.  */
    stw   6,0(3)
    stw   7,4(3)
    subi  31,31,16
    lwz   6,8(4)
    lwz   7,12(4)
    addi  4,4,16
    stw   6,8(3)
    stw   7,12(3)
    addi  3,3,16
    bf    28,L(wdu_h32_4)
    lwz   6,0(12)
    lwz   7,4(12)
    subi  31,31,8
    addi  4,4,8
    stw   6,0(11)
    stw   7,4(11)
    addi  3,3,8
    bf    29,L(wdu_h32_0)
    lwz   6,8(12)
    addi  4,4,4
    subi  31,31,4
    stw   6,8(11)
    addi  3,3,4
    b     L(wdu_h32_0)
    .align  4
L(wdu_h32_8):
    bf    28,L(wdu_h32_4)
    lwz   6,0(4)
    lwz   7,4(4)
    subi  31,31,8
    bf    29,L(wdu_h32_8x)
    stw   6,0(3)
    stw   7,4(3)
    lwz   6,8(4)
    addi  4,4,12
    subi  31,31,4
    stw   6,8(3)
    addi  3,3,12
    b     L(wdu_h32_0)
    .align  4
L(wdu_h32_8x):
    addi  4,4,8
    stw   6,0(3)
    stw   7,4(3)
    addi  3,3,8
    b     L(wdu_h32_0)
    .align  4
L(wdu_h32_4):
    bf    29,L(wdu_h32_0)
    lwz   6,0(4)
    subi  31,31,4
    addi  4,4,4
    stw   6,0(3)
    addi  3,3,4
    .align  4
L(wdu_h32_0):
/*  set up for 32-byte boundary crossing word move and possibly 32-byte
    move loop.  */
    clrrwi  12,4,2
    cmplwi  cr5,31,32
    bge     cr1,L(wdu2_32)
#if 0
    b       L(wdu1_32)
/*
    cmplwi  cr1,10,8
    beq     cr1,L(wdu1_32)
    cmplwi  cr1,10,16
    beq     cr1,L(wdu2_32)
    cmplwi  cr1,10,24
    beq     cr1,L(wdu3_32)
*/
L(wdu_32):
    lwz     6,0(12)
    cmplwi  cr6,31,4
    srwi    8,31,5    /* calculate the 32 byte loop count */
    slw     0,6,10
    clrlwi  31,31,27   /* The remaining bytes, < 32.  */
    blt     cr5,L(wdu_32tail)
    mtctr   8
    cmplwi  cr6,31,4
    .align  4
L(wdu_loop32):
    /* copy 32 bytes at a time */
    lwz   8,4(12)
    addi  12,12,32
    lwz   7,4(4)
    srw   8,8,9
    or    0,0,8
    stw   0,0(3)
    stw   7,4(3)
    lwz   6,8(4)
    lwz   7,12(4)
    stw   6,8(3)
    stw   7,12(3)
    lwz   6,16(4)
    lwz   7,20(4)
    stw   6,16(3)
    stw   7,20(3)
    lwz   6,24(4)
    lwz   7,28(4)
    lwz   8,0(12)
    addi  4,4,32
    stw   6,24(3)
    stw   7,28(3)
    addi  3,3,32
    slw   0,8,10
    bdnz+ L(wdu_loop32)

L(wdu_32tail):
    mtcrf   0x01,31
    cmplwi  cr5,31,16
    blt     cr6,L(wdu_4tail)
    /* calculate and store the final word */
    lwz   8,4(12)
    srw   8,8,9
    or    6,0,8
    b     L(wdu_32tailx)
#endif
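/* Src is 1 byte past a word boundary: the aligned word at src-1
   supplies 3 bytes of each dst word and the following word supplies
   1 byte (8/24-bit shifts, directions swapped on little-endian).  */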
    .align  4
L(wdu1_32):
    lwz     6,-1(4)
    cmplwi  cr6,31,4
    srwi    8,31,5    /* calculate the 32 byte loop count */
#ifdef __LITTLE_ENDIAN__
    srwi    6,6,8
#else
    slwi    6,6,8
#endif
    clrlwi  31,31,27   /* The remaining bytes, < 32.  */
    blt     cr5,L(wdu1_32tail)
    mtctr   8
    cmplwi  cr6,31,4

    lwz   8,3(4)
    lwz   7,4(4)
#ifdef __LITTLE_ENDIAN__
    rldimi 6,8,24,32
#else
/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
    rlwimi 6,8,8,(32-8),31
#endif
    b      L(wdu1_loop32x)
    .align  4
L(wdu1_loop32):
    /* copy 32 bytes at a time */
    lwz   8,3(4)
    lwz   7,4(4)
    stw   10,-8(3)
    stw   11,-4(3)
#ifdef __LITTLE_ENDIAN__
    rldimi 6,8,24,32
#else
/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
    rlwimi 6,8,8,(32-8),31
#endif
L(wdu1_loop32x):
    lwz   10,8(4)
    lwz   11,12(4)
    stw   6,0(3)
    stw   7,4(3)
    lwz   6,16(4)
    lwz   7,20(4)
    stw   10,8(3)
    stw   11,12(3)
    lwz   10,24(4)
    lwz   11,28(4)
    lwz   8,32-1(4)
    addi  4,4,32
    stw   6,16(3)
    stw   7,20(3)
    addi  3,3,32
#ifdef __LITTLE_ENDIAN__
    srwi  6,8,8
#else
    slwi  6,8,8
#endif
    bdnz+ L(wdu1_loop32)
    stw   10,-8(3)
    stw   11,-4(3)

L(wdu1_32tail):
    mtcrf   0x01,31
    cmplwi  cr5,31,16
    blt     cr6,L(wdu_4tail)
    /* calculate and store the final word */
    lwz   8,3(4)
#ifdef __LITTLE_ENDIAN__
    rldimi 6,8,24,32
#else
/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
    rlwimi 6,8,8,(32-8),31
#endif
    b     L(wdu_32tailx)

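/* Src is 2 bytes past a word boundary: each dst word takes 2 bytes
   from the aligned word at src-2 and 2 bytes from the following word
   (16/16-bit shifts).  */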
L(wdu2_32):
    bgt     cr1,L(wdu3_32)
    lwz     6,-2(4)
    cmplwi  cr6,31,4
    srwi    8,31,5    /* calculate the 32 byte loop count */
#ifdef __LITTLE_ENDIAN__
    srwi    6,6,16
#else
    slwi    6,6,16
#endif
    clrlwi  31,31,27   /* The remaining bytes, < 32.  */
    blt     cr5,L(wdu2_32tail)
    mtctr   8
    cmplwi  cr6,31,4

    lwz   8,2(4)
    lwz   7,4(4)
#ifdef __LITTLE_ENDIAN__
    rldimi 6,8,16,32
#else
    rlwimi 6,8,16,(32-16),31
#endif
    b      L(wdu2_loop32x)
    .align  4
L(wdu2_loop32):
    /* copy 32 bytes at a time */
    lwz   8,2(4)
    lwz   7,4(4)
    stw   10,-8(3)
    stw   11,-4(3)
#ifdef __LITTLE_ENDIAN__
    rldimi 6,8,16,32
#else
    rlwimi 6,8,16,(32-16),31
#endif
L(wdu2_loop32x):
    lwz   10,8(4)
    lwz   11,12(4)
    stw   6,0(3)
    stw   7,4(3)
    lwz   6,16(4)
    lwz   7,20(4)
    stw   10,8(3)
    stw   11,12(3)
    lwz   10,24(4)
    lwz   11,28(4)
/*    lwz   8,0(12) */
    lwz   8,32-2(4)
    addi  4,4,32
    stw   6,16(3)
    stw   7,20(3)
    addi  3,3,32
#ifdef __LITTLE_ENDIAN__
    srwi  6,8,16
#else
    slwi  6,8,16
#endif
    bdnz+ L(wdu2_loop32)
    stw   10,-8(3)
    stw   11,-4(3)

L(wdu2_32tail):
    mtcrf   0x01,31
    cmplwi  cr5,31,16
    blt     cr6,L(wdu_4tail)
    /* calculate and store the final word */
    lwz   8,2(4)
#ifdef __LITTLE_ENDIAN__
    rldimi 6,8,16,32
#else
    rlwimi 6,8,16,(32-16),31
#endif
    b     L(wdu_32tailx)

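/* Src is 3 bytes past a word boundary: the aligned word at src-3
   supplies 1 byte of each dst word and the following word supplies
   3 bytes (24/8-bit shifts, directions swapped on little-endian).  */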
L(wdu3_32):
/*    lwz     6,0(12) */
    lwz     6,-3(4)
    cmplwi  cr6,31,4
    srwi    8,31,5    /* calculate the 32 byte loop count */
#ifdef __LITTLE_ENDIAN__
    srwi    6,6,24
#else
    slwi    6,6,24
#endif
    clrlwi  31,31,27   /* The remaining bytes, < 32.  */
    blt     cr5,L(wdu3_32tail)
    mtctr   8
    cmplwi  cr6,31,4

    lwz   8,1(4)
    lwz   7,4(4)
#ifdef __LITTLE_ENDIAN__
    rldimi 6,8,8,32
#else
    rlwimi 6,8,24,(32-24),31
#endif
    b      L(wdu3_loop32x)
    .align  4
L(wdu3_loop32):
    /* copy 32 bytes at a time */
    lwz   8,1(4)
    lwz   7,4(4)
    stw   10,-8(3)
    stw   11,-4(3)
#ifdef __LITTLE_ENDIAN__
    rldimi 6,8,8,32
#else
    rlwimi 6,8,24,(32-24),31
#endif
L(wdu3_loop32x):
    lwz   10,8(4)
    lwz   11,12(4)
    stw   6,0(3)
    stw   7,4(3)
    lwz   6,16(4)
    lwz   7,20(4)
    stw   10,8(3)
    stw   11,12(3)
    lwz   10,24(4)
    lwz   11,28(4)
    lwz   8,32-3(4)
    addi  4,4,32
    stw   6,16(3)
    stw   7,20(3)
    addi  3,3,32
#ifdef __LITTLE_ENDIAN__
    srwi  6,8,24
#else
    slwi  6,8,24
#endif
    bdnz+ L(wdu3_loop32)
    stw   10,-8(3)
    stw   11,-4(3)

L(wdu3_32tail):
    mtcrf   0x01,31
    cmplwi  cr5,31,16
    blt     cr6,L(wdu_4tail)
    /* calculate and store the final word */
    lwz   8,1(4)
#ifdef __LITTLE_ENDIAN__
    rldimi 6,8,8,32
#else
    rlwimi 6,8,24,(32-24),31
#endif
    b     L(wdu_32tailx)
    .align  4
L(wdu_32tailx):
    blt     cr5,L(wdu_t32_8)
    lwz   7,4(4)
    addi  12,4,16    /* generate alternate pointers to avoid agen */
    addi  11,3,16    /* timing issues downstream.  */
    stw   6,0(3)
    stw   7,4(3)
    subi  31,31,16
    lwz   6,8(4)
    lwz   7,12(4)
    addi  4,4,16
    stw   6,8(3)
    stw   7,12(3)
    addi  3,3,16
    bf    28,L(wdu_t32_4x)
    lwz   6,0(12)
    lwz   7,4(12)
    addi  4,4,8
    subi  31,31,8
    stw   6,0(11)
    stw   7,4(11)
    addi  3,3,8
    bf    29,L(wdu_t32_0)
    lwz   6,8(12)
    addi  4,4,4
    subi  31,31,4
    stw   6,8(11)
    addi  3,3,4
    b     L(wdu_t32_0)
    .align  4
L(wdu_t32_4x):
    bf    29,L(wdu_t32_0)
    lwz   6,0(4)
    addi  4,4,4
    subi  31,31,4
    stw   6,0(3)
    addi  3,3,4
    b     L(wdu_t32_0)
    .align  4
L(wdu_t32_8):
    bf    28,L(wdu_t32_4)
    lwz   7,4(4)
    subi  31,31,8
    bf    29,L(wdu_t32_8x)
    stw   6,0(3)
    stw   7,4(3)
    lwz   6,8(4)
    subi  31,31,4
    addi  4,4,12
    stw   6,8(3)
    addi  3,3,12
    b     L(wdu_t32_0)
    .align  4
L(wdu_t32_8x):
    addi  4,4,8
    stw   6,0(3)
    stw   7,4(3)
    addi  3,3,8
    b     L(wdu_t32_0)
    .align  4
L(wdu_t32_4):
    subi  31,31,4
    stw   6,0(3)
    addi  4,4,4
    addi  3,3,4
    .align  4
L(wdu_t32_0):
L(wdu_4tail):
    cmplwi  cr6,31,0
    beq   cr6,L(wdus_0)	/* If the tail is 0 bytes we are done!  */
    bf    30,L(wdus_3)
    lhz   7,0(4)
    sth   7,0(3)
    bf    31,L(wdus_0)
    lbz   8,2(4)
    stb   8,2(3)
    mr    3,30
    lwz   30,20(1)
    lwz   31,24(1)
    addi  1,1,32
    blr
    .align  4
L(wdus_3):
    bf    31,L(wdus_0)
    lbz   6,0(4)
    stb   6,0(3)
    .align  4
L(wdus_0):
  /* Return original dst pointer.  */
    mr   3,30
    lwz  30,20(1)
    lwz  31,24(1)
    addi 1,1,32
    blr
END (memcpy)

libc_hidden_builtin_def (memcpy)
