/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */

	.machine  power7
EALIGN (__mempcpy, 5, 0)
	CALL_MCOUNT

	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	stw	31,24(1)
	mr	30,3
	cmplwi	cr1,5,31
	neg	0,3
	cfi_offset(31,-8)
	ble	cr1,L(copy_LT_32)	/* If move < 32 bytes use short move
					   code.  */

	andi.	11,3,7		/* Check alignment of DST.  */
	clrlwi	10,4,29		/* Check alignment of SRC.  */
	cmplw	cr6,10,11	/* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srwi	9,5,3		/* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrlwi	0,0,29
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */

1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrlwi	10,12,29	/* Check alignment of SRC again.  */
	srwi	9,31,3		/* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrlwi	11,31,29
	mtcrf	0x01,9

	srwi	8,31,5
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

	bf	30,1f
	lfd	6,0(12)
	lfd	7,8(12)
	addi	11,12,16
	mtctr	8
	stfd	6,0(3)
	stfd	7,8(3)
	addi	10,3,16
	bf	31,4f
	lfd	0,16(12)
	stfd	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	lfd	6,0(12)
	addi	11,12,8
	stfd	6,0(3)
	addi	10,3,8

	.align	4
4:	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	lfd	6,0(11)
	lfd	7,8(11)
	lfd	8,16(11)
	lfd	0,24(11)
	addi	11,11,32

	stfd	6,0(10)
	stfd	7,8(10)
	stfd	8,16(10)
	stfd	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */

	clrrwi	0,31,3
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr
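
	/* Every return sequence in this file follows the same pattern:
	   the result is the original DST (saved in r30 at entry) plus
	   LEN (still live in r5), the non-volatile registers that were
	   used are reloaded (r30 always, r31 only on the paths that
	   clobber it), and the stack frame is released.  */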

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmplwi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrwi	11,4,2
	andi.	0,8,3
	cmplwi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-bytes alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmplwi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could've used lfd/stfd here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
	   the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	andi.	11,3,15		/* Check alignment of DST.  */
	clrlwi	0,0,28		/* Number of bytes until the 1st
				   quadword of DST.  */
	srwi	9,5,4		/* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16-bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	lfd	6,0(12)
	addi	12,12,8
	stfd	6,0(3)
	addi	3,3,8
0:
	clrlwi	10,12,28	/* Check alignment of SRC.  */
	srwi	9,31,4		/* Number of full quadwords remaining.  */
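
	/* The copy itself uses the usual Altivec software-realignment
	   sequence: aligned lvx loads that straddle the (possibly
	   unaligned) source data are merged with vperm, using the
	   permute control vector produced by lvsl (lvsr with swapped
	   vperm inputs on little-endian), so that every vector store
	   to DST is aligned.  */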

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrlwi	11,31,28
	li	6,16		/* Index for 16-bytes offsets.  */
	li	7,32		/* Index for 32-bytes offsets.  */
	cmplwi	cr1,11,0
	srwi	8,31,5		/* Setup the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmplwi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,12
#else
	lvsl	5,0,12
#endif
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	4,11,6		/* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,11,7		/* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	clrrwi	0,31,4
	mtcrf	0x01,31
	beq	cr1,0f

	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

END (__mempcpy)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)
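
/* For reference, a rough C sketch of the interface implemented above;
   this only describes the semantics (copy LEN bytes and return
   DST + LEN), not how the optimized paths above perform the copy:

     void *
     __mempcpy (void *dst, const void *src, size_t len)
     {
       memcpy (dst, src, len);
       return (char *) dst + len;
     }  */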