// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- --------------------------------------------------------
 *
 * Copyright (C) 2016 Intel Corporation
 *
 * Author: Gayatri Kammela <gayatri.kammela@intel.com>
 * Author: Megha Dey <megha.dey@linux.intel.com>
 *
 * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 */
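
/*
 * Background: RAID-6 keeps two syndromes over the data disks
 * d_0..d_{z0}.  P is plain parity and Q is a Reed-Solomon syndrome
 * over GF(2^8) with generator g = 2:
 *
 *	P = d_{z0} + d_{z0-1} + ... + d_0
 *	Q = g^{z0}*d_{z0} + g^{z0-1}*d_{z0-1} + ... + g^0*d_0
 *
 * Evaluated by Horner's rule, Q only ever needs a multiply-by-2 in
 * GF(2^8).  As a rough scalar sketch of what the vectorized loops
 * below compute per byte i (illustrative only, not code used here):
 *
 *	u8 p, q;
 *	p = q = dptr[z0][i];
 *	for (z = z0 - 1; z >= 0; z--) {
 *		q = (q << 1) ^ ((q & 0x80) ? 0x1d : 0);	// q *= 2 in GF(2^8)
 *		p ^= dptr[z][i];
 *		q ^= dptr[z][i];
 *	}
 *
 * The 0x1d constant is the reduction term of the field polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (0x11d); it is replicated across a zmm
 * register below, so each instruction handles 64 bytes at once.
 */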

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx512_constants {
	u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

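/*
 * Every AVX-512 subfeature this file relies on must be present:
 * AVX512F for the zmm registers and vpxorq/vpandq, AVX512BW for the
 * byte-granular vpcmpgtb and vpmovm2b.  The remaining flags (AVX,
 * AVX2, AVX512VL/DQ) are checked as well, so the requirement is
 * broader than the strict minimum.
 */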
static int raid6_have_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

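	/*
	 * Per 64-byte chunk, zmm2 accumulates P and zmm4 accumulates Q.
	 * Multiplying Q by 2 in GF(2^8) is branchless: vpcmpgtb against
	 * the zero register sets a mask bit for every byte whose top bit
	 * is set (such bytes compare as negative signed values),
	 * vpmovm2b widens the mask to 0x00/0xff bytes, vpaddb doubles
	 * each byte, and vpandq + vpxorq fold in the 0x1d reduction
	 * constant only where the doubling overflowed.
	 */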
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"	/* P[0] */
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"	/* Q[0] */
			     "vmovdqa64 %1,%%zmm6"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm6"
				     :
				     : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm4,%1\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
			     :
			     : "m" (p[d]), "m" (q[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

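/*
 * xor_syndrome() folds an update for data disks start..stop into
 * existing P/Q pages.  Disks above 'stop' are untouched ("right side
 * optimization": Horner's rule starts at z0 = stop).  For disks below
 * 'start' no data needs to be read; the loop merely keeps doubling the
 * accumulated Q delta so the changed blocks get their correct g^z
 * weights ("left side optimization").  A scalar sketch per byte i
 * (illustrative only):
 *
 *	u8 dp = dptr[stop][i];			// running P delta
 *	u8 dq = dptr[stop][i];			// running Q delta
 *	for (z = stop - 1; z >= start; z--) {
 *		dq = (dq << 1) ^ ((dq & 0x80) ? 0x1d : 0);
 *		dp ^= dptr[z][i];
 *		dq ^= dptr[z][i];
 *	}
 *	for (z = start - 1; z >= 0; z--)	// left side: weights only
 *		dq = (dq << 1) ^ ((dq & 0x80) ? 0x1d : 0);
 *	p[i] ^= dp;
 *	q[i] ^= dq;
 */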
static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm2\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
			     :
			     : "m" (dptr[z0][d]), "m" (p[d]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : "m" (dptr[z][d]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     /* Don't use movntdq for r/w memory area < cache line */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm2,%1"
			     :
			     : "m" (q[d]), "m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x1 = {
	raid6_avx5121_gen_syndrome,
	raid6_avx5121_xor_syndrome,
	raid6_have_avx512,
	"avx512x1",
	1			/* Has cache hints */
};
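
/*
 * The positional initializers above are, in order: gen_syndrome,
 * xor_syndrome, the validity check, the algorithm name, and a flag
 * noting a special performance attribute (here, the non-temporal
 * cache hints).  The RAID-6 core benchmarks every valid
 * implementation at init time and selects the fastest, which is how
 * these routines get picked up.
 */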

/*
 * Unrolled-by-2 AVX512 implementation
 */
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	/* We uniformly assume a single prefetch covers at least 64 bytes */
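	/*
	 * Two independent 64-byte lanes (P[0]/Q[0] in zmm2/zmm4 and
	 * P[1]/Q[1] in zmm3/zmm6) are kept in flight, covering 128
	 * bytes of each disk per iteration and exposing more
	 * instruction-level parallelism than the single-lane variant.
	 */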
	for (d = 0; d < bytes; d += 128) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"	/* P[0] */
			     "vmovdqa64 %1,%%zmm3\n\t"	/* P[1] */
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"	/* Q[0] */
			     "vmovdqa64 %%zmm3,%%zmm6"	/* Q[1] */
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm4,%2\n\t"
			     "vmovntdq %%zmm6,%3"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
			       "m" (q[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm2\n\t"
			     "vmovdqa64 %3,%%zmm3\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (p[d]), "m" (p[d+64]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
			     /* Don't use movntdq for r/w
			      * memory area < cache line
			      */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm6,%1\n\t"
			     "vmovdqa64 %%zmm2,%2\n\t"
			     "vmovdqa64 %%zmm3,%3"
			     :
			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
			       "m" (p[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
	raid6_avx5122_gen_syndrome,
	raid6_avx5122_xor_syndrome,
	raid6_have_avx512,
	"avx512x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64
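
/*
 * The 4-way unrolled variant below uses zmm10-zmm15.  Vector
 * registers beyond [xyz]mm7 are architecturally available only in
 * 64-bit mode, hence the CONFIG_X86_64 guard.
 */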

/*
 * Unrolled-by-4 AVX512 implementation
 */
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"	/* Zero temp */
		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"	/* P[0] */
		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"	/* P[1] */
		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"	/* Q[0] */
		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"	/* Q[1] */
		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"	/* P[2] */
		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"	/* P[3] */
		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"	/* Q[2] */
		     "vpxorq %%zmm14,%%zmm14,%%zmm14"	/* Q[3] */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 256) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "prefetchnta %2\n\t"
				     "prefetchnta %3\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vmovntdq %%zmm14,%7\n\t"
			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     :: "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 256) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm12\n\t"
			     "vmovdqa64 %3,%%zmm14\n\t"
			     "vmovdqa64 %4,%%zmm2\n\t"
			     "vmovdqa64 %5,%%zmm3\n\t"
			     "vmovdqa64 %6,%%zmm10\n\t"
			     "vmovdqa64 %7,%%zmm11\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "prefetchnta %0\n\t"
				     "prefetchnta %2\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]),
				       "m" (dptr[z][d+192]));
		}
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     :
			     : "m" (q[d]), "m" (q[d+128]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : );
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vmovntdq %%zmm14,%7"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
	raid6_avx5124_gen_syndrome,
	raid6_avx5124_xor_syndrome,
	raid6_have_avx512,
	"avx512x4",
	1			/* Has cache hints */
};
#endif

#endif /* CONFIG_AS_AVX512 */