lib/raid6/avx512.c
/* -*- linux-c -*- --------------------------------------------------------
 *
 *   Copyright (C) 2016 Intel Corporation
 *
 *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
 *   Author: Megha Dey <megha.dey@linux.intel.com>
 *
 *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 *
 */
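
/*
 * For reference: these routines compute the standard RAID-6 syndrome pair
 * over GF(2^8) with generator g = 2 (reduction polynomial 0x11d).  For
 * data disks D_0 .. D_z0,
 *
 *      P = D_0 ^ D_1 ^ ... ^ D_z0
 *      Q = g^0*D_0 ^ g^1*D_1 ^ ... ^ g^z0*D_z0
 *
 * Q is accumulated by Horner's rule, walking from the highest data disk
 * down: Q = (Q * 2) ^ D_z for z = z0 .. 0, one byte lane at a time.
 */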

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx512_constants {
        u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
        { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};
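/*
 * The 0x1d pattern fills one 64-byte zmm register with the low byte of
 * the GF(2^8) reduction polynomial 0x11d (x^8 + x^4 + x^3 + x^2 + 1);
 * it is XORed into the byte lanes that overflow when the running Q is
 * multiplied by 2.
 */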

static int raid6_have_avx512(void)
{
        return boot_cpu_has(X86_FEATURE_AVX2) &&
                boot_cpu_has(X86_FEATURE_AVX) &&
                boot_cpu_has(X86_FEATURE_AVX512F) &&
                boot_cpu_has(X86_FEATURE_AVX512BW) &&
                boot_cpu_has(X86_FEATURE_AVX512VL) &&
                boot_cpu_has(X86_FEATURE_AVX512DQ);
}

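/*
 * Rough scalar sketch, for orientation only (not the code path the kernel
 * runs): what the vectorized loops below compute for one byte column d.
 * Each zmm byte lane performs this on one byte, 64 lanes at a time:
 *
 *      u8 pv = 0, qv = 0;
 *      for (z = z0; z >= 0; z--) {
 *              u8 wd = dptr[z][d];
 *              pv ^= wd;
 *              qv = (qv << 1) ^ ((qv & 0x80) ? 0x1d : 0) ^ wd;
 *      }
 *      p[d] = pv;
 *      q[d] = qv;
 */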
static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0; d < bytes; d += 64) {
                asm volatile("prefetchnta %0\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
                             "vmovdqa64 %1,%%zmm6"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
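                /*
                 * Each pass of the loop below multiplies the running Q
                 * (zmm4) by 2 in GF(2^8): vpcmpgtb/vpmovm2b build a byte
                 * mask of lanes whose top bit is set, vpaddb doubles every
                 * byte, and the mask selects the 0x1d constant (zmm0) to
                 * XOR into the overflowing lanes.  The next data block is
                 * then XORed into both P (zmm2) and Q (zmm4).
                 */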
                for (z = z0-2; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]));
                }
                asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                             "vpmovm2b %%k1,%%zmm5\n\t"
                             "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                             "vmovntdq %%zmm2,%0\n\t"
                             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
                             "vmovntdq %%zmm4,%1\n\t"
                             "vpxorq %%zmm4,%%zmm4,%%zmm4"
                             :
                             : "m" (p[d]), "m" (q[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

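/*
 * Fold the P/Q contribution of data disks start..stop into an existing
 * P/Q pair (partial-stripe update).  Disks above 'stop' are not touched
 * at all; disks below 'start' only scale the accumulated Q by further
 * powers of the generator, without reloading their unchanged data.
 */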
static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 64) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm2\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2"
                             :
                             : "m" (dptr[z0][d]), "m" (p[d]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : "m" (dptr[z][d]));
                }
                /* P/Q left side optimization */
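                /*
                 * No data is read here: the accumulated Q is merely
                 * multiplied by 2 once per remaining lower-numbered disk,
                 * so each contribution ends up scaled by the correct
                 * power of the generator.
                 */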
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : );
                }
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                /* Don't use movntdq for r/w memory area < cache line */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm2,%1"
                             :
                             : "m" (q[d]), "m" (p[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

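/*
 * Descriptor used by the raid6 library's algorithm selection code; the
 * trailing 1 flags that this implementation uses cache hints
 * (prefetchnta and non-temporal stores).
 */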
const struct raid6_calls raid6_avx512x1 = {
        raid6_avx5121_gen_syndrome,
        raid6_avx5121_xor_syndrome,
        raid6_have_avx512,
        "avx512x1",
        1                       /* Has cache hints */
};

/*
 * Unrolled-by-2 AVX512 implementation
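 *
 * Two 64-byte chunks (128 bytes) are processed per loop iteration, with
 * two independent P/Q accumulator chains (zmm2/zmm4 and zmm3/zmm6).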
 */
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        /* We uniformly assume a single prefetch covers at least 64 bytes */
        for (d = 0; d < bytes; d += 128) {
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
                             "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
                             "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
                             "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
                for (z = z0-1; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "prefetchnta %1\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vmovntdq %%zmm4,%2\n\t"
                             "vmovntdq %%zmm6,%3"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
                               "m" (q[d+64]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 128) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm6\n\t"
                             "vmovdqa64 %2,%%zmm2\n\t"
                             "vmovdqa64 %3,%%zmm3\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm3,%%zmm3"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
                               "m" (p[d]), "m" (p[d+64]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : );
                }
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                             "vpxorq %1,%%zmm6,%%zmm6\n\t"
                             /* Don't use movntdq for r/w
                              * memory area < cache line
                              */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm6,%1\n\t"
                             "vmovdqa64 %%zmm2,%2\n\t"
                             "vmovdqa64 %%zmm3,%3"
                             :
                             : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
                               "m" (p[d+64]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
        raid6_avx5122_gen_syndrome,
        raid6_avx5122_xor_syndrome,
        raid6_have_avx512,
        "avx512x2",
        1                       /* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX512 implementation
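 *
 * Four 64-byte chunks (256 bytes) are processed per loop iteration, using
 * the additional zmm10-zmm15 accumulators and opmask registers k1-k4.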
 */
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
                     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
                     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
                     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
                     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
                     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
                     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
                     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
                     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0; d < bytes; d += 256) {
                for (z = z0; z >= 0; z--) {
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             "prefetchnta %2\n\t"
                             "prefetchnta %3\n\t"
                             "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                             "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
                             "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
                             "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
                             "vpmovm2b %%k1,%%zmm5\n\t"
                             "vpmovm2b %%k2,%%zmm7\n\t"
                             "vpmovm2b %%k3,%%zmm13\n\t"
                             "vpmovm2b %%k4,%%zmm15\n\t"
                             "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                             "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                             "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                             "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                             "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                             "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                             "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                             "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                             "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
                             "vmovdqa64 %0,%%zmm5\n\t"
                             "vmovdqa64 %1,%%zmm7\n\t"
                             "vmovdqa64 %2,%%zmm13\n\t"
                             "vmovdqa64 %3,%%zmm15\n\t"
                             "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                             "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
                             "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                             "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                             "vpxorq %%zmm15,%%zmm14,%%zmm14"
                             :
                             : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
                               "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
                             "vmovntdq %%zmm10,%2\n\t"
                             "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
                             "vmovntdq %%zmm11,%3\n\t"
                             "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
                             "vmovntdq %%zmm4,%4\n\t"
                             "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vmovntdq %%zmm6,%5\n\t"
                             "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
                             "vmovntdq %%zmm12,%6\n\t"
                             "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
                             "vmovntdq %%zmm14,%7\n\t"
                             "vpxorq %%zmm14,%%zmm14,%%zmm14"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
                               "m" (q[d+128]), "m" (q[d+192]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     :: "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 256) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm6\n\t"
                             "vmovdqa64 %2,%%zmm12\n\t"
                             "vmovdqa64 %3,%%zmm14\n\t"
                             "vmovdqa64 %4,%%zmm2\n\t"
                             "vmovdqa64 %5,%%zmm3\n\t"
                             "vmovdqa64 %6,%%zmm10\n\t"
                             "vmovdqa64 %7,%%zmm11\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
                             "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
                             "vpxorq %%zmm14,%%zmm11,%%zmm11"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
                               "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
                               "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
                                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
                                     "prefetchnta %0\n\t"
                                     "prefetchnta %2\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
                                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpmovm2b %%k3,%%zmm13\n\t"
                                     "vpmovm2b %%k4,%%zmm15\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vmovdqa64 %2,%%zmm13\n\t"
                                     "vmovdqa64 %3,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
                                     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
                                       "m" (dptr[z][d+128]),
                                       "m" (dptr[z][d+192]));
                }
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             :
                             : "m" (q[d]), "m" (q[d+128]));
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
                                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
                                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpmovm2b %%k3,%%zmm13\n\t"
                                     "vpmovm2b %%k4,%%zmm15\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
                                     :
                                     : );
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vmovntdq %%zmm10,%2\n\t"
                             "vmovntdq %%zmm11,%3\n\t"
                             "vpxorq %4,%%zmm4,%%zmm4\n\t"
                             "vpxorq %5,%%zmm6,%%zmm6\n\t"
                             "vpxorq %6,%%zmm12,%%zmm12\n\t"
                             "vpxorq %7,%%zmm14,%%zmm14\n\t"
                             "vmovntdq %%zmm4,%4\n\t"
                             "vmovntdq %%zmm6,%5\n\t"
                             "vmovntdq %%zmm12,%6\n\t"
                             "vmovntdq %%zmm14,%7"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
                               "m" (q[d+128]), "m" (q[d+192]));
        }
        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}
const struct raid6_calls raid6_avx512x4 = {
        raid6_avx5124_gen_syndrome,
        raid6_avx5124_xor_syndrome,
        raid6_have_avx512,
        "avx512x4",
        1                       /* Has cache hints */
};
#endif

#endif /* CONFIG_AS_AVX512 */