/* memset optimized with AVX512 for KNL hardware.
   Copyright (C) 2015-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc)

#include "asm-syntax.h"
#ifndef MEMSET
# define MEMSET __memset_avx512_no_vzeroupper
# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
#endif

        .section .text.avx512,"ax",@progbits
#if defined PIC
ENTRY (MEMSET_CHK)
        cmp     %RDX_LP, %RCX_LP
        jb      HIDDEN_JUMPTARGET (__chk_fail)
END (MEMSET_CHK)
#endif

ENTRY (MEMSET)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        mov     %edx, %edx
# endif
        /* Broadcast the fill byte: XMM0 is zeroed, so VPSHUFB picks
           byte 0 of XMM1 for all 16 lanes of XMM0.  */
        vpxor   %xmm0, %xmm0, %xmm0
        vmovd   %esi, %xmm1
        /* RSI = dst + n.  Tails are addressed backwards from the end,
           so head and tail stores may overlap.  */
        lea     (%rdi, %rdx), %rsi
        /* memset returns the destination.  */
        mov     %rdi, %rax
        vpshufb %xmm0, %xmm1, %xmm0
        cmp     $16, %rdx
        jb      L(less_16bytes)
        cmp     $512, %rdx
        vbroadcastss    %xmm0, %zmm2
        ja      L(512bytesormore)
        cmp     $256, %rdx
        jb      L(less_256bytes)
        vmovups %zmm2, (%rdi)
        vmovups %zmm2, 0x40(%rdi)
        vmovups %zmm2, 0x80(%rdi)
        vmovups %zmm2, 0xC0(%rdi)
        vmovups %zmm2, -0x100(%rsi)
        vmovups %zmm2, -0xC0(%rsi)
        vmovups %zmm2, -0x80(%rsi)
        vmovups %zmm2, -0x40(%rsi)
        ret

/* n < 256 from here on, so the low byte of RDX is sufficient for the
   remaining size comparisons.  */
L(less_256bytes):
        cmp     $128, %dl
        jb      L(less_128bytes)
        vmovups %zmm2, (%rdi)
        vmovups %zmm2, 0x40(%rdi)
        vmovups %zmm2, -0x80(%rsi)
        vmovups %zmm2, -0x40(%rsi)
        ret

L(less_128bytes):
        cmp     $64, %dl
        jb      L(less_64bytes)
        vmovups %zmm2, (%rdi)
        vmovups %zmm2, -0x40(%rsi)
        ret

L(less_64bytes):
        cmp     $32, %dl
        jb      L(less_32bytes)
        vmovdqu %ymm2, (%rdi)
        vmovdqu %ymm2, -0x20(%rsi)
        ret

L(less_32bytes):
        vmovdqu %xmm0, (%rdi)
        vmovdqu %xmm0, -0x10(%rsi)
        ret

L(less_16bytes):
        cmp     $8, %dl
        jb      L(less_8bytes)
        vmovq   %xmm0, (%rdi)
        vmovq   %xmm0, -0x08(%rsi)
        ret

L(less_8bytes):
        vmovd   %xmm0, %ecx
        cmp     $4, %dl
        jb      L(less_4bytes)
        mov     %ecx, (%rdi)
        mov     %ecx, -0x04(%rsi)
        ret

L(less_4bytes):
        cmp     $2, %dl
        jb      L(less_2bytes)
        mov     %cx, (%rdi)
        mov     %cx, -0x02(%rsi)
        ret

L(less_2bytes):
        cmp     $1, %dl
        jb      L(less_1bytes)
        mov     %cl, (%rdi)
L(less_1bytes):
        ret

L(512bytesormore):
        mov     __x86_shared_cache_size_half(%rip), %rcx
        cmp     %rcx, %rdx
        ja      L(preloop_large)
        cmp     $1024, %rdx
        ja      L(1024bytesormore)

        vmovups %zmm2, (%rdi)
        vmovups %zmm2, 0x40(%rdi)
        vmovups %zmm2, 0x80(%rdi)
        vmovups %zmm2, 0xC0(%rdi)
        vmovups %zmm2, 0x100(%rdi)
        vmovups %zmm2, 0x140(%rdi)
        vmovups %zmm2, 0x180(%rdi)
        vmovups %zmm2, 0x1C0(%rdi)
        vmovups %zmm2, -0x200(%rsi)
        vmovups %zmm2, -0x1C0(%rsi)
        vmovups %zmm2, -0x180(%rsi)
        vmovups %zmm2, -0x140(%rsi)
        vmovups %zmm2, -0x100(%rsi)
        vmovups %zmm2, -0xC0(%rsi)
        vmovups %zmm2, -0x80(%rsi)
        vmovups %zmm2, -0x40(%rsi)
        ret

/* Align on 64 and loop with aligned stores.  */
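/* The AND/ADD pair below rounds RDI up to the next 64-byte boundary,
   so the loop issues only aligned stores; the unaligned store through
   RAX (the original destination) covers every byte skipped by the
   round-up.  RSI is pulled back by 0x100 before the loop, so the loop
   exits with at most 256 bytes left, and the four trailing unaligned
   stores at (%rsi) rewrite the final 256 bytes, possibly overlapping
   the last iteration.  */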
L(1024bytesormore):
        sub     $0x100, %rsi
        /* Unaligned store through the saved destination covers the
           head before RDI is rounded up.  */
        vmovups %zmm2, (%rax)
        and     $-0x40, %rdi
        add     $0x40, %rdi

L(gobble_256bytes_loop):
        vmovaps %zmm2, (%rdi)
        vmovaps %zmm2, 0x40(%rdi)
        vmovaps %zmm2, 0x80(%rdi)
        vmovaps %zmm2, 0xC0(%rdi)
        add     $0x100, %rdi
        cmp     %rsi, %rdi
        jb      L(gobble_256bytes_loop)
        vmovups %zmm2, (%rsi)
        vmovups %zmm2, 0x40(%rsi)
        vmovups %zmm2, 0x80(%rsi)
        vmovups %zmm2, 0xC0(%rsi)
        ret

/* Align on 128 and loop with non-temporal stores.  The two unaligned
   head stores through RAX cover every byte skipped when RDI is rounded
   up to a 128-byte boundary, and RSI is pulled back by 0x200 so the
   eight trailing unaligned stores can finish the final 512 bytes.  */
L(preloop_large):
        and     $-0x80, %rdi
        add     $0x80, %rdi
        vmovups %zmm2, (%rax)
        vmovups %zmm2, 0x40(%rax)
        sub     $0x200, %rsi

L(gobble_512bytes_nt_loop):
        vmovntdq %zmm2, (%rdi)
        vmovntdq %zmm2, 0x40(%rdi)
        vmovntdq %zmm2, 0x80(%rdi)
        vmovntdq %zmm2, 0xC0(%rdi)
        vmovntdq %zmm2, 0x100(%rdi)
        vmovntdq %zmm2, 0x140(%rdi)
        vmovntdq %zmm2, 0x180(%rdi)
        vmovntdq %zmm2, 0x1C0(%rdi)
        add     $0x200, %rdi
        cmp     %rsi, %rdi
        jb      L(gobble_512bytes_nt_loop)
        /* Order the non-temporal stores before the ordinary stores
           that finish the tail.  */
        sfence
        vmovups %zmm2, (%rsi)
        vmovups %zmm2, 0x40(%rsi)
        vmovups %zmm2, 0x80(%rsi)
        vmovups %zmm2, 0xC0(%rsi)
        vmovups %zmm2, 0x100(%rsi)
        vmovups %zmm2, 0x140(%rsi)
        vmovups %zmm2, 0x180(%rsi)
        vmovups %zmm2, 0x1C0(%rsi)
        ret
END (MEMSET)
#endif
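/* Illustrative only: a hedged C-level sketch of the size dispatch
   implemented above.  The helper names are invented for exposition,
   nothing in this comment is part of the build, and the boundaries
   simply mirror the cmp instructions:

     if (n < 16)                                  // 1/2/4/8-byte stores
       store_scalar_tails (dst, byte, n);
     else if (n <= 512)                           // overlapping vector stores
       store_vectors_from_both_ends (dst, byte, n);
     else if (n > __x86_shared_cache_size_half)   // too big to cache
       nt_store_loop (dst, byte, n);              // vmovntdq + sfence
     else if (n > 1024)
       aligned_store_loop (dst, byte, n);         // 64-byte-aligned vmovaps
     else                                         // 512 < n <= 1024
       store_vectors_from_both_ends (dst, byte, n);
     return dst;  */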