/* Optimized memset implementation for POWER10 LE.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */

#ifndef MEMSET
# define MEMSET memset
#endif

	.machine  power9
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

L(_memset):
	/* Assume memset of zero length is uncommon, and just let it go
	   through the small path below.  */
	cmpldi	r5,64

	/* Replicate byte to quad word.  */
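	/* mtvsrd copies r4 into the most-significant doubleword of the VSR;
	   vspltb with index 7 then replicates its low byte (the char value)
	   into all 16 bytes of v0.  */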
	mtvsrd	v0+32,r4
	vspltb	v0,v0,7

	li	r7,16
	sldi	r8,r7,56
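	/* stxvl takes the number of bytes to store from the top byte (bits
	   0:7) of its length register, so byte counts are kept shifted left
	   by 56 throughout.  r8 holds 16 in that format and is used to
	   consume the length 16B at a time.  */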

	bgt	L(large)

	/* For short lengths we want to avoid as many branches as possible,
	   so we use the Store VSX Vector with Length (stxvl) instruction.
	   It takes advantage of the fact that a zero length passed to stxvl
	   stores nothing, making it effectively a no-op.  */
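	/* The branchless sequence below is roughly equivalent to the
	   following C sketch (illustrative only, not part of the build;
	   stxvl16 stands for one stxvl storing min (len, 16) bytes, and the
	   lengths are kept shifted left by 56 in the real code):

	       len2 = len  > 16 ? len  - 16 : 0;
	       len3 = len2 > 16 ? len2 - 16 : 0;
	       len4 = len3 > 16 ? len3 - 16 : 0;
	       stxvl16 (dst,      len);
	       stxvl16 (dst + 16, len2);
	       stxvl16 (dst + 32, len3);
	       stxvl16 (dst + 48, len4);

	   The sub./isellt pairs implement the saturating subtraction:
	   isellt selects 0 when the preceding subtraction went negative
	   (CR0.LT set).  */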
	sldi	r5,r5,56

	addi	r10,r3,16

	sub.	r11,r5,r8
	isellt	r11,0,r11	/* Saturate the subtraction to zero.  */

	stxvl	v0+32,r3,r5
	stxvl	v0+32,r10,r11

	addi	r9,r3,32
	addi	r10,r3,48

	sub.	r11,r11,r8
	isellt	r11,0,r11

	sub.	r5,r11,r8
	isellt	r5,0,r5

	stxvl	v0+32,r9,r11
	stxvl	v0+32,r10,r5

	blr

	.balign	16
L(large):
	mr	r6,r3	/* Don't modify r3 since we need to return it.  */

	/* Get dest 16B aligned.  */
	neg	r0,r3
	clrldi.	r7,r0,(64-4)
	beq	L(aligned)
	rldic	r9,r0,56,4	/* (r0 & 0xf)<<56, i.e. "clrlsldi r9,r0,64-4,56".  */

	stxvl	v0+32,r6,r9	/* Store up to 15B until aligned address.  */

	add	r6,r6,r7
	sub	r5,r5,r7
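	/* For example, with DST = 0x1009: r7 = 7, so stxvl stores 7 bytes,
	   DST advances to the 16B-aligned 0x1010, and the length is reduced
	   by 7.  */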

	/* Go to tail if there is less than 64B left after alignment.  */
	cmpldi	r5,64
	blt	L(tail_64)

	.balign	16
L(aligned):
	/* Go to tail if there is less than 128B left after alignment.  */
	srdi.	r0,r5,7
	beq	L(tail_128)

	/* If c == 0 && n >= 256 use dcbz to zero out full cache blocks.  */
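	/* cr5.gt (CR bit 21) is set when n > 255 and cr6.eq (CR bit 26) when
	   c == 0; crand ANDs them into CR bit 27 (cr6.so), which bt then
	   tests.  */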
	cmpldi	cr5,r5,255
	cmpldi	cr6,r4,0
	crand	27,26,21
	bt	27,L(dcbz)

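	/* ctr = remaining length / 128 full blocks for the unrolled loop
	   below; the residual (length % 128) is handled at L(tail).  */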
	mtctr	r0

	.balign	32
L(loop):
	stxv	v0+32,0(r6)
	stxv	v0+32,16(r6)
	stxv	v0+32,32(r6)
	stxv	v0+32,48(r6)
	stxv	v0+32,64(r6)
	stxv	v0+32,80(r6)
	stxv	v0+32,96(r6)
	stxv	v0+32,112(r6)
	addi	r6,r6,128
	bdnz	L(loop)

	.balign	16
L(tail):
	/* 127B or less left, finish the tail or return.  */
	andi.	r5,r5,127
	beqlr

	cmpldi	r5,64
	blt	L(tail_64)

	.balign	16
L(tail_128):
	/* Store a minimum of 64B and up to 128B, then return.  */
	stxv	v0+32,0(r6)
	stxv	v0+32,16(r6)
	stxv	v0+32,32(r6)
	stxv	v0+32,48(r6)
	addi	r6,r6,64
	andi.	r5,r5,63
	beqlr

	.balign	16
L(tail_64):
	/* Store up to 64B, then return.  */
	sldi	r5,r5,56

	addi	r10,r6,16

	sub.	r11,r5,r8
	isellt	r11,0,r11

	stxvl	v0+32,r6,r5
	stxvl	v0+32,r10,r11

	sub.	r11,r11,r8
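	/* If no more than 32B were requested, the two stores above already
	   covered them, so return.  */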
	blelr

	addi	r9,r6,32
	addi	r10,r6,48

	isellt	r11,0,r11

	sub.	r5,r11,r8
	isellt	r5,0,r5

	stxvl	v0+32,r9,r11
	stxvl	v0+32,r10,r5

	blr

	.balign	16
L(dcbz):
	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
	   Before using dcbz though, we need to get the destination 128-byte
	   aligned.  */
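	/* dcbz zeroes an entire 128-byte cache block, so the destination is
	   first brought to a 128-byte boundary with regular vector stores;
	   the n > 255 check above guarantees at least one full block remains
	   after that.  */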
	neg	r0,r6
	clrldi.	r0,r0,(64-7)
	beq	L(dcbz_aligned)

	sub	r5,r5,r0
	mtocrf	0x2,r0	/* Copy bits 57..59 to cr6: the bits for sizes 64,
			   32 and 16, which need to be checked.  */
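	/* After the mtocrf, cr6 bit 1 (CR bit 25) corresponds to 64B, bit 2
	   (CR bit 26) to 32B and bit 3 (CR bit 27) to 16B, matching the bf
	   tests below.  */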

	/* Write 16-112 bytes until DST is aligned to 128 bytes.  */
64:	bf	25,32f
	stxv	v0+32,0(r6)
	stxv	v0+32,16(r6)
	stxv	v0+32,32(r6)
	stxv	v0+32,48(r6)
	addi	r6,r6,64

32:	bf	26,16f
	stxv	v0+32,0(r6)
	stxv	v0+32,16(r6)
	addi	r6,r6,32

16:	bf	27,L(dcbz_aligned)
	stxv	v0+32,0(r6)
	addi	r6,r6,16

	.balign	16
L(dcbz_aligned):
	/* Set up the dcbz unroll offsets and the loop count.  */
	srdi.	r0,r5,9
	li	r9,128
	beq	L(dcbz_tail)
	li	r10,256
	li	r11,384
	mtctr	r0

	.balign	16
L(dcbz_loop):
	/* Set 512 bytes to zero in each iteration; this unrolling shows a
	   throughput boost for large sizes (2048 bytes or higher).  */
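	/* Roughly, in C (illustrative sketch only; dcbz_block stands for one
	   dcbz zeroing a single 128B cache block):

	       for (i = 0; i < len / 512; i++)
	         {
	           dcbz_block (dst);
	           dcbz_block (dst + 128);
	           dcbz_block (dst + 256);
	           dcbz_block (dst + 384);
	           dst += 512;
	         }  */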
	dcbz	0,r6
	dcbz	r9,r6
	dcbz	r10,r6
	dcbz	r11,r6
	addi	r6,r6,512
	bdnz	L(dcbz_loop)

	andi.	r5,r5,511
	beqlr

	.balign	16
L(dcbz_tail):
	/* We have 1-511 bytes remaining.  */
	srdi.	r0,r5,7
	beq	L(tail)

	mtocrf	0x1,r0
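	/* The mtocrf copies the low bits of the 128B block count into cr7:
	   CR bit 30 is set when the count has its 2's bit set (store 256B
	   below), CR bit 31 when its 1's bit is set (store 128B).  */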

256:	bf	30,128f
	dcbz	0,r6
	dcbz	r9,r6
	addi	r6,r6,256

128:	bf	31,L(tail)
	dcbz	0,r6
	addi	r6,r6,128

	b	L(tail)

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)

/* Copied from bzero.S to prevent the linker from inserting a stub
   between bzero and memset.  */
ENTRY_TOCLESS (__bzero)
	CALL_MCOUNT 2
	mr	r5,r4
	li	r4,0
	b	L(_memset)
END (__bzero)
#ifndef __bzero
weak_alias (__bzero, bzero)
#endif