/* Optimized memset implementation for POWER10 LE.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */

#ifndef MEMSET
# define MEMSET memset
#endif

	.machine power9
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

L(_memset):
	/* Assume memset of zero length is uncommon, and just let it go
	   through the small path below.  */
	cmpldi	r5,64

	/* Replicate byte to quad word.  */
	mtvsrd	v0+32,r4
	vspltb	v0,v0,7

	/* r8 = 16 placed in bits 0:7, the length field expected by stxvl.  */
	li	r7,16
	sldi	r8,r7,56

	bgt	L(large)

	/* For short lengths we want to avoid as many branches as possible.
	   We use store VSX vector with length (stxvl) instructions to do this.
	   This takes advantage of the fact that a length of zero passed to
	   stxvl stores nothing, effectively a no-op.  */
	sldi	r5,r5,56	/* Move the length into bits 0:7 for stxvl.  */

	addi	r10,r3,16

	sub.	r11,r5,r8
	isellt	r11,0,r11	/* Saturate the subtraction to zero.  */

	stxvl	v0+32,r3,r5
	stxvl	v0+32,r10,r11

	addi	r9,r3,32
	addi	r10,r3,48

	sub.	r11,r11,r8
	isellt	r11,0,r11

	sub.	r5,r11,r8
	isellt	r5,0,r5

	stxvl	v0+32,r9,r11
	stxvl	v0+32,r10,r5

	blr

	.balign	16
L(large):
	mr	r6,r3	/* Don't modify r3 since we need to return it.  */

	/* Get dest 16B aligned.  */
	neg	r0,r3
	clrldi.	r7,r0,(64-4)
	beq	L(aligned)
	rldic	r9,r0,56,4	/* (~X & 0xf)<<56 "clrlsldi r9,r0,64-4,56".  */

	stxvl	v0+32,r6,r9	/* Store up to 15B until aligned address.  */

	add	r6,r6,r7
	sub	r5,r5,r7

	/* Go to tail if there is less than 64B left after alignment.  */
	cmpldi	r5,64
	blt	L(tail_64)

	.balign	16
L(aligned):
	/* Go to tail if there is less than 128B left after alignment.  */
	srdi.	r0,r5,7
	beq	L(tail_128)

	/* If c == 0 && n >= 256, use dcbz to zero out full cache blocks.  */
	cmpldi	cr5,r5,255
	cmpldi	cr6,r4,0
	crand	27,26,21	/* cr6.so = (c == 0) && (n > 255).  */
	bt	27,L(dcbz)

	mtctr	r0

	.balign	32
L(loop):
	stxv	v0+32,0(r6)
	stxv	v0+32,16(r6)
	stxv	v0+32,32(r6)
	stxv	v0+32,48(r6)
	stxv	v0+32,64(r6)
	stxv	v0+32,80(r6)
	stxv	v0+32,96(r6)
	stxv	v0+32,112(r6)
	addi	r6,r6,128
	bdnz	L(loop)

	.balign	16
L(tail):
	/* 127B or less left; finish the tail or return.  */
	andi.	r5,r5,127
	beqlr

	cmpldi	r5,64
	blt	L(tail_64)

	.balign	16
L(tail_128):
	/* Store a minimum of 64B and up to 128B, then return.  */
	stxv	v0+32,0(r6)
	stxv	v0+32,16(r6)
	stxv	v0+32,32(r6)
	stxv	v0+32,48(r6)
	addi	r6,r6,64
	andi.	r5,r5,63
	beqlr

	.balign	16
L(tail_64):
	/* Store up to 64B and return.  */
	sldi	r5,r5,56

	addi	r10,r6,16

	sub.	r11,r5,r8
	isellt	r11,0,r11

	stxvl	v0+32,r6,r5
	stxvl	v0+32,r10,r11

	sub.	r11,r11,r8
	blelr

	addi	r9,r6,32
	addi	r10,r6,48

	isellt	r11,0,r11

	sub.	r5,r11,r8
	isellt	r5,0,r5

	stxvl	v0+32,r9,r11
	stxvl	v0+32,r10,r5

	blr

	.balign	16
L(dcbz):
	/* Special case when the value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
	   Before using dcbz though, we need to get the destination 128-byte
	   aligned.  */
	neg	r0,r6
	clrldi.	r0,r0,(64-7)
	beq	L(dcbz_aligned)

	sub	r5,r5,r0
	mtocrf	0x2,r0	/* Copy bits 57..59 to cr6; these select the 64B,
			   32B and 16B steps that need to be checked.  */

	/* Write 16-128 bytes until DST is aligned to 128 bytes.  */
64:	bf	25,32f
	stxv	v0+32,0(r6)
	stxv	v0+32,16(r6)
	stxv	v0+32,32(r6)
	stxv	v0+32,48(r6)
	addi	r6,r6,64

32:	bf	26,16f
	stxv	v0+32,0(r6)
	stxv	v0+32,16(r6)
	addi	r6,r6,32

16:	bf	27,L(dcbz_aligned)
	stxv	v0+32,0(r6)
	addi	r6,r6,16

	.balign	16
L(dcbz_aligned):
	/* Set up the dcbz unroll offsets and the loop count.  */
	srdi.	r0,r5,9
	li	r9,128
	beq	L(dcbz_tail)
	li	r10,256
	li	r11,384
	mtctr	r0

	.balign	16
L(dcbz_loop):
	/* Set 512 bytes to zero in each iteration; the loop unrolling shows
	   a throughput boost for large sizes (2048 bytes or higher).  */
	dcbz	0,r6
	dcbz	r9,r6
	dcbz	r10,r6
	dcbz	r11,r6
	addi	r6,r6,512
	bdnz	L(dcbz_loop)

	andi.	r5,r5,511
	beqlr

	.balign	16
L(dcbz_tail):
	/* We have 1-511 bytes remaining.  */
	srdi.	r0,r5,7
	beq	L(tail)

	mtocrf	0x1,r0	/* Copy bits 60..63 to cr7; bits 62 and 63 select the
			   256B and 128B steps below.  */

256:	bf	30,128f
	dcbz	0,r6
	dcbz	r9,r6
	addi	r6,r6,256

128:	bf	31,L(tail)
	dcbz	0,r6
	addi	r6,r6,128

	b	L(tail)

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)

/* Copied from bzero.S to prevent the linker from inserting a stub
   between bzero and memset.  */
ENTRY_TOCLESS (__bzero)
	CALL_MCOUNT 2
	mr	r5,r4
	li	r4,0
	b	L(_memset)
END (__bzero)
#ifndef __bzero
weak_alias (__bzero, bzero)
#endif
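
/* Illustration only (kept inside a comment so it is not assembled): a minimal
   C sketch of the branchless short path above, assuming n <= 64.  Each step
   models one stxvl: it stores at most 16 bytes and stores nothing once the
   remaining length has saturated to zero, which is what the sub./isellt pairs
   compute before feeding stxvl.  The function and helper names below are
   hypothetical and not part of glibc.

   #include <stddef.h>

   // Model of stxvl: store min (len, 16) bytes of 'c' at 'dst'; a length of
   // zero stores nothing, i.e. a no-op.
   static void store_up_to_16 (unsigned char *dst, unsigned char c, size_t len)
   {
     if (len > 16)
       len = 16;
     for (size_t i = 0; i < len; i++)
       dst[i] = c;
   }

   void * memset_small_sketch (void *s, int c, size_t n)  // assumes n <= 64
   {
     unsigned char *p = s;
     size_t remaining = n;

     for (int i = 0; i < 4; i++)
       {
         store_up_to_16 (p + 16 * i, (unsigned char) c, remaining);
         // Saturating subtraction, mirroring the sub./isellt sequence.
         remaining = remaining > 16 ? remaining - 16 : 0;
       }
     return s;
   }
*/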