/* Optimized memcmp implementation for POWER7/PowerPC64.
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */
#ifndef MEMCMP
# define MEMCMP memcmp
#endif
	.machine power7
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN		r3
#define rSTR1		r3	/* first string arg */
#define rSTR2		r4	/* second string arg */
#define rN		r5	/* max string length */
#define rWORD1		r6	/* current word in s1 */
#define rWORD2		r7	/* current word in s2 */
#define rWORD3		r8	/* next word in s1 */
#define rWORD4		r9	/* next word in s2 */
#define rWORD5		r10	/* next word in s1 */
#define rWORD6		r11	/* next word in s2 */

#define rOFF8		r20	/* 8 bytes offset.  */
#define rOFF16		r21	/* 16 bytes offset.  */
#define rOFF24		r22	/* 24 bytes offset.  */
#define rOFF32		r23	/* 32 bytes offset.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rSHL		r29	/* Unaligned shift left count.  */
#define rWORD7		r30	/* next word in s1 */
#define rWORD8		r31	/* next word in s2 */

#define rWORD8SAVE	(-8)
#define rWORD7SAVE	(-16)
#define rOFF8SAVE	(-24)
#define rOFF16SAVE	(-32)
#define rOFF24SAVE	(-40)
#define rOFF32SAVE	(-48)
#define rSHRSAVE	(-56)
#define rSHLSAVE	(-64)
#define rWORD8SHIFTSAVE	(-72)
#define rWORD2SHIFTSAVE	(-80)
#define rWORD4SHIFTSAVE	(-88)
#define rWORD6SHIFTSAVE	(-96)

#ifdef __LITTLE_ENDIAN__
# define LD	ldbrx
#else
# define LD	ldx
#endif
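
/* Note on LD: on little-endian, ldbrx loads each doubleword
   byte-reversed, so both endiannesses see the first memory byte in the
   most significant position, and an unsigned doubleword compare then
   orders exactly as a bytewise memcmp would.  A rough C equivalent (a
   sketch only, assuming <stdint.h>/<string.h> and GCC's
   __builtin_bswap64; load64 is a hypothetical helper, not part of this
   file):

     static inline uint64_t
     load64 (const unsigned char *p)
     {
       uint64_t w;
       memcpy (&w, p, 8);
     #ifdef __LITTLE_ENDIAN__
       w = __builtin_bswap64 (w);	// what ldbrx does
     #endif
       return w;				// first byte of p is the MSB
     }
*/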

	xor	r0, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	r0, r0, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If the compare length is less than 12 bytes, use the byte-at-a-time
   loop, which handles any alignment.  */
	blt	cr1, L(bytealigned)
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)

	li	rOFF8,8
	li	rOFF16,16
	li	rOFF24,24
	li	rOFF32,32

	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW. This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
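
/* For example (illustrative values, not from the original): if rSTR1
   ends in 0b101 (r12 == 5) and rN == 20, both addresses are rounded
   down 5 bytes, rN becomes 25 so the last byte is still covered, and
   the first DW pair is shifted left by r12 * 8 == 40 bits so the 5
   bytes preceding the true start drop out of the compare.  */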
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dPs1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
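
/* The dispatch below, restated as arithmetic: CTR gets rN / 32 (each
   loop iteration compares 32 bytes), rN % 32 selects the entry point
   (0 -> L(dP4), 8 -> L(dP1), 16 -> L(dP2), 24 -> L(dP3)), and the
   final rN % 8 bytes are left in rN for the L(d00) tail compare.
   E.g. rN == 52 gives CTR == 1, entry at L(dP2), and a 4 byte tail.  */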
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder */
	srdi	r0, rN, 5	/* Divide by 32 */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	r0
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	r0
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
	.align	4
L(dP2x):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	r0
	LD	rWORD3, 0, rSTR1
	LD	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	r0
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
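
/* In C terms the tail compare is roughly (a sketch, using the
   big-endian view the LD macro provides and the hypothetical load64
   from above; p1/p2 point at the DW holding the leftover bytes and
   rem is the 1-7 byte remainder):

     uint64_t a = load64 (p1) >> (64 - rem * 8);	// aligned, so the
     uint64_t b = load64 (p2) >> (64 - rem * 8);	// full DW load is safe
     return a == b ? 0 : (a > b ? 1 : -1);
*/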
L(d00):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latency (load to
   compare to conditional branch) is 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
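
/* A rough C analogue of the rotation (a sketch of the schedule, not
   the literal code; s1/s2 as const unsigned char *, n >= 2 assumed):

     int c0 = s1[0] - s2[0];		// iteration i loads pair i,
     int c1 = s1[1] - s2[1];		// compares pair i-1, and branches
     for (size_t i = 2; i < n; i++)	// on pair i-2, overlapping the
       {				// load->compare->branch latency
	 if (c0) return c0 < 0 ? -1 : 1;
	 c0 = c1;
	 c1 = s1[i] - s2[i];
       }
     if (c0) return c0 < 0 ? -1 : 1;
     if (c1) return c1 < 0 ? -1 : 1;
     return 0;
*/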

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz	L(bLoop)

/* We speculatively load bytes before the previous bytes have been
   tested, but we must avoid overrunning the length (in the CTR) so
   that these speculative loads cannot fault.  The loop therefore
   exits early, before all pending bytes have been tested, and we
   must complete the pending compares before returning.  */
L(b1i):
	bne	cr7, L(bLcr7)
	bne	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne	cr6, L(bLcr6)
	bne	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne	cr1, L(bLcr1)
	bne	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne	cr7, L(bx12)
	bne	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is already double word
   aligned and we can use the DWunaligned loop.

   Otherwise rSTR1 is not yet DW aligned.  So we force the string
   addresses to the next lower DW boundary and special case this first
   DW using shift left to eliminate bits preceding the first byte.
   Since we want to join the normal (DWaligned) compare loop, starting
   at the second double word, we need to adjust the length (rN) and
   special case the loop versioning for the first DW. This ensures that
   the loop count is correct and the first DW (shifted) is in the
   expected register pair.  */
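
/* The core merge, in C (a sketch using the big-endian view from LD and
   the hypothetical load64; shl = (misalignment & 7) * 8, shr = 64 - shl):

     uint64_t prev = load64 (p2);		// aligned DW at/before start
     uint64_t cur  = load64 (p2 + 8);		// next aligned DW
     uint64_t w2   = (prev << shl) | (cur >> shr);  // logical rSTR2 DW

   The loop keeps each "cur << shl" in a rWORDn_SHIFT register, so every
   later merge costs only one new load, one shift, and one OR.  */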
L(unaligned):
	std	rSHL, rSHLSAVE(r1)
	cfi_offset(rSHL, rSHLSAVE)
	clrldi	rSHL, rSTR2, 61
	beq	cr6, L(duzeroLength)
	std	rSHR, rSHRSAVE(r1)
	cfi_offset(rSHR, rSHRSAVE)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
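/* Concretely (illustrative addresses, not from the original): with
   rSTR2 == 0x1000 and r12 == 4, the logical start rWORD8_SHIFT is
   0xffc, which falls in the DW at 0xff8.  That DW holds no rSTR2 bytes
   at all and could sit on an unmapped page, so it is replaced by zero
   (the blt cr5 below) rather than loaded.  */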
	li	rWORD8, 0
	blt	cr5, L(dus0)
	LD	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	srdi	r0, rN, 5	/* Divide by 32 */
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	andi.	r12, rN, 24	/* Get the DW remainder */
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	sldi	rSHL, rSHL, 3
	LD	rWORD6, 0, rSTR2
	LD	rWORD8, rOFF8, rSTR2
	addi	rSTR2, rSTR2, 8
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
	LD	rWORD7, 0, rSTR1
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
	LD	rWORD5, 0, rSTR1
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
	LD	rWORD3, 0, rSTR1
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0
	srd	r0, rWORD8, rSHR
	LD	rWORD1, 0, rSTR1
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz	L(du24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	cmpld	cr7, rWORD1, rWORD2
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz	L(duLoop)

L(duL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length. So we compare the bit length of the remainder to
   the right shift count (rSHR). If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
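/* As a C condition (a sketch, with rem_bits = rN, already in bits, and
   saved_shift standing in for rWORD8_SHIFT):

     if (rem_bits <= shr)
       w2 = saved_shift;			// prev DW already holds the tail
     else
       w2 = saved_shift | (load64 (p2 + 8) >> shr);  // need the next DW

   so the next rSTR2 DW is touched only when some of the remaining
   bytes actually live in it.  */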
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
	LD	rWORD1, rOFF8, rSTR1
	ld	rWORD8, rWORD8SAVE(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - rN (rN is already in bits).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, rWORD7SAVE(r1)
	ld	rSHL, rSHLSAVE(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, rSHRSAVE(r1)
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)

	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dureturn29):
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	blr

L(duzeroLength):
	ld	rOFF8,  rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)