1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 #ifndef _ASM_X86_XOR_H
3 #define _ASM_X86_XOR_H
4
5 /*
6 * Optimized RAID-5 checksumming functions for SSE.
7 */
8
9 /*
10 * Cache avoiding checksumming functions utilizing KNI instructions
11 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
12 */
13
14 /*
15 * Based on
16 * High-speed RAID5 checksumming functions utilizing SSE instructions.
17 * Copyright (C) 1998 Ingo Molnar.
18 */
19
20 /*
21 * x86-64 changes / gcc fixes from Andi Kleen.
22 * Copyright 2002 Andi Kleen, SuSE Labs.
23 *
24 * This hasn't been optimized for the hammer yet, but there are likely
25 * no advantages to be gotten from x86-64 here anyways.
26 */
27
28 #include <asm/fpu/api.h>
29
/*
 * Constraint string for the [inc] operand (the constant 256UL) of every
 * asm statement in this file: "i" accepts an immediate only, "re" accepts
 * a general register or a 32-bit sign-extendable immediate.
 */
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
36
/*
 * Building blocks for the asm templates below. Each macro expands to a
 * string literal (or to nothing) that is concatenated into the template,
 * so the arguments end up as assembler text, not as C expressions.
 *
 * x selects a 16-byte slot relative to the current pointer value, y is the
 * xmm register number.
 */

/* Displacement of slot x, in bytes. */
#define OFFS(x) "16*("#x")"
/* Displacement of slot x plus 256 bytes; used by the prefetch macros only. */
#define PF_OFFS(x) "256+16*("#x")"
/* Non-temporal prefetch at PF_OFFS(x) from %[p1]. */
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
/* Load slot x of %[p1] into %xmmY. */
#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
/* Store %xmmY to slot x of %[p1]. */
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
/* PF1..PF4: same as PF0, relative to %[p2]..%[p5]. */
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
/* XO1..XO4: XOR slot x of %[p2]..%[p5] into %xmmY. */
#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
/* Expands to nothing; passed as BLK64's pf argument to omit the prefetch. */
#define NOP(x)

/*
 * One 64-byte step: a single pf(i) followed by op applied to slots
 * i..i+3, paired with xmm0..xmm3.
 */
#define BLK64(pf, op, i) \
pf(i) \
op(i, 0) \
op(i + 1, 1) \
op(i + 2, 2) \
op(i + 3, 3)
58
59 static void
xor_sse_2(unsigned long bytes,unsigned long * p1,unsigned long * p2)60 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
61 {
62 unsigned long lines = bytes >> 8;
63
64 kernel_fpu_begin();
65
66 asm volatile(
67 #undef BLOCK
68 #define BLOCK(i) \
69 LD(i, 0) \
70 LD(i + 1, 1) \
71 PF1(i) \
72 PF1(i + 2) \
73 LD(i + 2, 2) \
74 LD(i + 3, 3) \
75 PF0(i + 4) \
76 PF0(i + 6) \
77 XO1(i, 0) \
78 XO1(i + 1, 1) \
79 XO1(i + 2, 2) \
80 XO1(i + 3, 3) \
81 ST(i, 0) \
82 ST(i + 1, 1) \
83 ST(i + 2, 2) \
84 ST(i + 3, 3) \
85
86
87 PF0(0)
88 PF0(2)
89
90 " .align 32 ;\n"
91 " 1: ;\n"
92
93 BLOCK(0)
94 BLOCK(4)
95 BLOCK(8)
96 BLOCK(12)
97
98 " add %[inc], %[p1] ;\n"
99 " add %[inc], %[p2] ;\n"
100 " dec %[cnt] ;\n"
101 " jnz 1b ;\n"
102 : [cnt] "+r" (lines),
103 [p1] "+r" (p1), [p2] "+r" (p2)
104 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
105 : "memory");
106
107 kernel_fpu_end();
108 }
109
110 static void
xor_sse_2_pf64(unsigned long bytes,unsigned long * p1,unsigned long * p2)111 xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
112 {
113 unsigned long lines = bytes >> 8;
114
115 kernel_fpu_begin();
116
117 asm volatile(
118 #undef BLOCK
119 #define BLOCK(i) \
120 BLK64(PF0, LD, i) \
121 BLK64(PF1, XO1, i) \
122 BLK64(NOP, ST, i) \
123
124 " .align 32 ;\n"
125 " 1: ;\n"
126
127 BLOCK(0)
128 BLOCK(4)
129 BLOCK(8)
130 BLOCK(12)
131
132 " add %[inc], %[p1] ;\n"
133 " add %[inc], %[p2] ;\n"
134 " dec %[cnt] ;\n"
135 " jnz 1b ;\n"
136 : [cnt] "+r" (lines),
137 [p1] "+r" (p1), [p2] "+r" (p2)
138 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
139 : "memory");
140
141 kernel_fpu_end();
142 }
143
144 static void
xor_sse_3(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3)145 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
146 unsigned long *p3)
147 {
148 unsigned long lines = bytes >> 8;
149
150 kernel_fpu_begin();
151
152 asm volatile(
153 #undef BLOCK
154 #define BLOCK(i) \
155 PF1(i) \
156 PF1(i + 2) \
157 LD(i, 0) \
158 LD(i + 1, 1) \
159 LD(i + 2, 2) \
160 LD(i + 3, 3) \
161 PF2(i) \
162 PF2(i + 2) \
163 PF0(i + 4) \
164 PF0(i + 6) \
165 XO1(i, 0) \
166 XO1(i + 1, 1) \
167 XO1(i + 2, 2) \
168 XO1(i + 3, 3) \
169 XO2(i, 0) \
170 XO2(i + 1, 1) \
171 XO2(i + 2, 2) \
172 XO2(i + 3, 3) \
173 ST(i, 0) \
174 ST(i + 1, 1) \
175 ST(i + 2, 2) \
176 ST(i + 3, 3) \
177
178
179 PF0(0)
180 PF0(2)
181
182 " .align 32 ;\n"
183 " 1: ;\n"
184
185 BLOCK(0)
186 BLOCK(4)
187 BLOCK(8)
188 BLOCK(12)
189
190 " add %[inc], %[p1] ;\n"
191 " add %[inc], %[p2] ;\n"
192 " add %[inc], %[p3] ;\n"
193 " dec %[cnt] ;\n"
194 " jnz 1b ;\n"
195 : [cnt] "+r" (lines),
196 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
197 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
198 : "memory");
199
200 kernel_fpu_end();
201 }
202
203 static void
xor_sse_3_pf64(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3)204 xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
205 unsigned long *p3)
206 {
207 unsigned long lines = bytes >> 8;
208
209 kernel_fpu_begin();
210
211 asm volatile(
212 #undef BLOCK
213 #define BLOCK(i) \
214 BLK64(PF0, LD, i) \
215 BLK64(PF1, XO1, i) \
216 BLK64(PF2, XO2, i) \
217 BLK64(NOP, ST, i) \
218
219 " .align 32 ;\n"
220 " 1: ;\n"
221
222 BLOCK(0)
223 BLOCK(4)
224 BLOCK(8)
225 BLOCK(12)
226
227 " add %[inc], %[p1] ;\n"
228 " add %[inc], %[p2] ;\n"
229 " add %[inc], %[p3] ;\n"
230 " dec %[cnt] ;\n"
231 " jnz 1b ;\n"
232 : [cnt] "+r" (lines),
233 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
234 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
235 : "memory");
236
237 kernel_fpu_end();
238 }
239
240 static void
xor_sse_4(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3,unsigned long * p4)241 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
242 unsigned long *p3, unsigned long *p4)
243 {
244 unsigned long lines = bytes >> 8;
245
246 kernel_fpu_begin();
247
248 asm volatile(
249 #undef BLOCK
250 #define BLOCK(i) \
251 PF1(i) \
252 PF1(i + 2) \
253 LD(i, 0) \
254 LD(i + 1, 1) \
255 LD(i + 2, 2) \
256 LD(i + 3, 3) \
257 PF2(i) \
258 PF2(i + 2) \
259 XO1(i, 0) \
260 XO1(i + 1, 1) \
261 XO1(i + 2, 2) \
262 XO1(i + 3, 3) \
263 PF3(i) \
264 PF3(i + 2) \
265 PF0(i + 4) \
266 PF0(i + 6) \
267 XO2(i, 0) \
268 XO2(i + 1, 1) \
269 XO2(i + 2, 2) \
270 XO2(i + 3, 3) \
271 XO3(i, 0) \
272 XO3(i + 1, 1) \
273 XO3(i + 2, 2) \
274 XO3(i + 3, 3) \
275 ST(i, 0) \
276 ST(i + 1, 1) \
277 ST(i + 2, 2) \
278 ST(i + 3, 3) \
279
280
281 PF0(0)
282 PF0(2)
283
284 " .align 32 ;\n"
285 " 1: ;\n"
286
287 BLOCK(0)
288 BLOCK(4)
289 BLOCK(8)
290 BLOCK(12)
291
292 " add %[inc], %[p1] ;\n"
293 " add %[inc], %[p2] ;\n"
294 " add %[inc], %[p3] ;\n"
295 " add %[inc], %[p4] ;\n"
296 " dec %[cnt] ;\n"
297 " jnz 1b ;\n"
298 : [cnt] "+r" (lines), [p1] "+r" (p1),
299 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
300 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
301 : "memory");
302
303 kernel_fpu_end();
304 }
305
306 static void
xor_sse_4_pf64(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3,unsigned long * p4)307 xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
308 unsigned long *p3, unsigned long *p4)
309 {
310 unsigned long lines = bytes >> 8;
311
312 kernel_fpu_begin();
313
314 asm volatile(
315 #undef BLOCK
316 #define BLOCK(i) \
317 BLK64(PF0, LD, i) \
318 BLK64(PF1, XO1, i) \
319 BLK64(PF2, XO2, i) \
320 BLK64(PF3, XO3, i) \
321 BLK64(NOP, ST, i) \
322
323 " .align 32 ;\n"
324 " 1: ;\n"
325
326 BLOCK(0)
327 BLOCK(4)
328 BLOCK(8)
329 BLOCK(12)
330
331 " add %[inc], %[p1] ;\n"
332 " add %[inc], %[p2] ;\n"
333 " add %[inc], %[p3] ;\n"
334 " add %[inc], %[p4] ;\n"
335 " dec %[cnt] ;\n"
336 " jnz 1b ;\n"
337 : [cnt] "+r" (lines), [p1] "+r" (p1),
338 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
339 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
340 : "memory");
341
342 kernel_fpu_end();
343 }
344
345 static void
xor_sse_5(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3,unsigned long * p4,unsigned long * p5)346 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
347 unsigned long *p3, unsigned long *p4, unsigned long *p5)
348 {
349 unsigned long lines = bytes >> 8;
350
351 kernel_fpu_begin();
352
353 asm volatile(
354 #undef BLOCK
355 #define BLOCK(i) \
356 PF1(i) \
357 PF1(i + 2) \
358 LD(i, 0) \
359 LD(i + 1, 1) \
360 LD(i + 2, 2) \
361 LD(i + 3, 3) \
362 PF2(i) \
363 PF2(i + 2) \
364 XO1(i, 0) \
365 XO1(i + 1, 1) \
366 XO1(i + 2, 2) \
367 XO1(i + 3, 3) \
368 PF3(i) \
369 PF3(i + 2) \
370 XO2(i, 0) \
371 XO2(i + 1, 1) \
372 XO2(i + 2, 2) \
373 XO2(i + 3, 3) \
374 PF4(i) \
375 PF4(i + 2) \
376 PF0(i + 4) \
377 PF0(i + 6) \
378 XO3(i, 0) \
379 XO3(i + 1, 1) \
380 XO3(i + 2, 2) \
381 XO3(i + 3, 3) \
382 XO4(i, 0) \
383 XO4(i + 1, 1) \
384 XO4(i + 2, 2) \
385 XO4(i + 3, 3) \
386 ST(i, 0) \
387 ST(i + 1, 1) \
388 ST(i + 2, 2) \
389 ST(i + 3, 3) \
390
391
392 PF0(0)
393 PF0(2)
394
395 " .align 32 ;\n"
396 " 1: ;\n"
397
398 BLOCK(0)
399 BLOCK(4)
400 BLOCK(8)
401 BLOCK(12)
402
403 " add %[inc], %[p1] ;\n"
404 " add %[inc], %[p2] ;\n"
405 " add %[inc], %[p3] ;\n"
406 " add %[inc], %[p4] ;\n"
407 " add %[inc], %[p5] ;\n"
408 " dec %[cnt] ;\n"
409 " jnz 1b ;\n"
410 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
411 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
412 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
413 : "memory");
414
415 kernel_fpu_end();
416 }
417
418 static void
xor_sse_5_pf64(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3,unsigned long * p4,unsigned long * p5)419 xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
420 unsigned long *p3, unsigned long *p4, unsigned long *p5)
421 {
422 unsigned long lines = bytes >> 8;
423
424 kernel_fpu_begin();
425
426 asm volatile(
427 #undef BLOCK
428 #define BLOCK(i) \
429 BLK64(PF0, LD, i) \
430 BLK64(PF1, XO1, i) \
431 BLK64(PF2, XO2, i) \
432 BLK64(PF3, XO3, i) \
433 BLK64(PF4, XO4, i) \
434 BLK64(NOP, ST, i) \
435
436 " .align 32 ;\n"
437 " 1: ;\n"
438
439 BLOCK(0)
440 BLOCK(4)
441 BLOCK(8)
442 BLOCK(12)
443
444 " add %[inc], %[p1] ;\n"
445 " add %[inc], %[p2] ;\n"
446 " add %[inc], %[p3] ;\n"
447 " add %[inc], %[p4] ;\n"
448 " add %[inc], %[p5] ;\n"
449 " dec %[cnt] ;\n"
450 " jnz 1b ;\n"
451 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
452 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
453 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
454 : "memory");
455
456 kernel_fpu_end();
457 }
458
459 static struct xor_block_template xor_block_sse_pf64 = {
460 .name = "prefetch64-sse",
461 .do_2 = xor_sse_2_pf64,
462 .do_3 = xor_sse_3_pf64,
463 .do_4 = xor_sse_4_pf64,
464 .do_5 = xor_sse_5_pf64,
465 };
466
/*
 * Retire the asm-fragment helpers so the headers included below start
 * without them.
 * NOTE(review): OFFS, PF_OFFS and PF0..PF4 are not undefined here --
 * confirm whether the included headers rely on them or this is an omission.
 */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

/* Pull in the 32-bit or 64-bit specific part of the XOR support. */
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

/*
 * Forwards FASTEST unchanged to AVX_SELECT(), which is not defined in this
 * file -- presumably it comes from one of the headers included above;
 * verify.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
AVX_SELECT(FASTEST)
487
488 #endif /* _ASM_X86_XOR_H */
489