arch/powerpc/lib/copyuser_power7.S
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE  256
#define STK_REG(i)      (112 + ((i)-14)*8)

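/*
 * Each errN macro drops a local label in front of the user-access
 * instruction that follows it and records a label/fixup pair in __ex_table,
 * so a fault on that access is redirected to the matching .Ldo_errN handler
 * below instead of killing the copy.
 */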
        .macro err1
100:
        .section __ex_table,"a"
        .align 3
        .llong 100b,.Ldo_err1
        .previous
        .endm

        .macro err2
200:
        .section __ex_table,"a"
        .align 3
        .llong 200b,.Ldo_err2
        .previous
        .endm

#ifdef CONFIG_ALTIVEC
        .macro err3
300:
        .section __ex_table,"a"
        .align 3
        .llong 300b,.Ldo_err3
        .previous
        .endm

        .macro err4
400:
        .section __ex_table,"a"
        .align 3
        .llong 400b,.Ldo_err4
        .previous
        .endm


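/*
 * Fixups for faults taken while VMX is live: restore the extra GPRs we
 * saved, leave VMX mode via exit_vmx_usercopy(), recover the link register
 * and join the common exit path below.
 */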
.Ldo_err4:
        ld      r16,STK_REG(r16)(r1)
        ld      r15,STK_REG(r15)(r1)
        ld      r14,STK_REG(r14)(r1)
.Ldo_err3:
        bl      .exit_vmx_usercopy
        ld      r0,STACKFRAMESIZE+16(r1)
        mtlr    r0
        b       .Lexit
#endif /* CONFIG_ALTIVEC */

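/*
 * Fixups for the scalar loops: restore any saved non-volatile GPRs, pop our
 * stack frame, then reload the original dest/src/len saved at entry and let
 * __copy_tofrom_user_base redo the copy and work out the return value.
 */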
.Ldo_err2:
        ld      r22,STK_REG(r22)(r1)
        ld      r21,STK_REG(r21)(r1)
        ld      r20,STK_REG(r20)(r1)
        ld      r19,STK_REG(r19)(r1)
        ld      r18,STK_REG(r18)(r1)
        ld      r17,STK_REG(r17)(r1)
        ld      r16,STK_REG(r16)(r1)
        ld      r15,STK_REG(r15)(r1)
        ld      r14,STK_REG(r14)(r1)
.Lexit:
        addi    r1,r1,STACKFRAMESIZE
.Ldo_err1:
        ld      r3,48(r1)
        ld      r4,56(r1)
        ld      r5,64(r1)
        b       __copy_tofrom_user_base


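/*
 * __copy_tofrom_user_power7(to, from, len): r3 = destination, r4 = source,
 * r5 = length.  Copies below 16B take the byte/word path, medium sizes use
 * an unrolled GPR loop, and anything above 4kB (with CONFIG_ALTIVEC) goes
 * through the VMX path.  Returns 0 on success; on a fault the fixups fall
 * back to __copy_tofrom_user_base, which determines the bytes not copied.
 */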
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,4096

        std     r3,48(r1)
        std     r4,56(r1)
        std     r5,64(r1)

        blt     .Lshort_copy
        bgt     cr1,.Lvmx_copy
#else
        cmpldi  r5,16

        std     r3,48(r1)
        std     r4,56(r1)
        std     r5,64(r1)

        blt     .Lshort_copy
#endif

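/*
 * Scalar (GPR only) copy: align the source to 8B, stream whole 128B
 * cachelines through r0,r6-r12,r14-r21, then mop up the remainder below.
 */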
.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

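        /*
         * mtocrf 0x01 puts the low bits of the negated source address into
         * cr7; bits 3, 2 and 1 then say whether a byte, halfword and/or
         * word needs to be moved to reach 8B alignment, while r6 keeps the
         * byte count so it can be subtracted from the length afterwards.
         */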
        bf      cr7*4+3,1f
err1;   lbz     r0,0(r4)
        addi    r4,r4,1
err1;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(r14)(r1)
        std     r15,STK_REG(r15)(r1)
        std     r16,STK_REG(r16)(r1)
        std     r17,STK_REG(r17)(r1)
        std     r18,STK_REG(r18)(r1)
        std     r19,STK_REG(r19)(r1)
        std     r20,STK_REG(r20)(r1)
        std     r21,STK_REG(r21)(r1)
        std     r22,STK_REG(r22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

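        /*
         * The non-volatile GPRs used below live in the STK_REG() slots of
         * the frame just created.  CTR gets the number of whole 128B
         * cachelines; the low seven bits of the length are handled after
         * the loop.
         */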
        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
4:
err2;   ld      r0,0(r4)
err2;   ld      r6,8(r4)
err2;   ld      r7,16(r4)
err2;   ld      r8,24(r4)
err2;   ld      r9,32(r4)
err2;   ld      r10,40(r4)
err2;   ld      r11,48(r4)
err2;   ld      r12,56(r4)
err2;   ld      r14,64(r4)
err2;   ld      r15,72(r4)
err2;   ld      r16,80(r4)
err2;   ld      r17,88(r4)
err2;   ld      r18,96(r4)
err2;   ld      r19,104(r4)
err2;   ld      r20,112(r4)
err2;   ld      r21,120(r4)
        addi    r4,r4,128
err2;   std     r0,0(r3)
err2;   std     r6,8(r3)
err2;   std     r7,16(r3)
err2;   std     r8,24(r3)
err2;   std     r9,32(r3)
err2;   std     r10,40(r3)
err2;   std     r11,48(r3)
err2;   std     r12,56(r3)
err2;   std     r14,64(r3)
err2;   std     r15,72(r3)
err2;   std     r16,80(r3)
err2;   std     r17,88(r3)
err2;   std     r18,96(r3)
err2;   std     r19,104(r3)
err2;   std     r20,112(r3)
err2;   std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(r14)(r1)
        ld      r15,STK_REG(r15)(r1)
        ld      r16,STK_REG(r16)(r1)
        ld      r17,STK_REG(r17)(r1)
        ld      r18,STK_REG(r18)(r1)
        ld      r19,STK_REG(r19)(r1)
        ld      r20,STK_REG(r20)(r1)
        ld      r21,STK_REG(r21)(r1)
        ld      r22,STK_REG(r22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6

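        /*
         * Fewer than 128B remain: cr7 bit 1 selects a 64B block, bit 2 a
         * 32B block and bit 3 a 16B block, then .Lshort_copy finishes the
         * final 15B or less.
         */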
6:      bf      cr7*4+1,7f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
err1;   ld      r9,32(r4)
err1;   ld      r10,40(r4)
err1;   ld      r11,48(r4)
err1;   ld      r12,56(r4)
        addi    r4,r4,64
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
err1;   std     r9,32(r3)
err1;   std     r10,40(r3)
err1;   std     r11,48(r3)
err1;   std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
        addi    r4,r4,32
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
        addi    r4,r4,16
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err1;   lwz     r6,4(r4)
        addi    r4,r4,8
err1;   stw     r0,0(r3)
err1;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err1;   lbz     r0,0(r4)
err1;   stb     r0,0(r3)

15:     li      r3,0
        blr

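/*
 * Taken when enter_vmx_usercopy() says VMX cannot be used: pop the frame
 * set up in .Lvmx_copy and run the copy through the scalar path instead.
 */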
.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
        mflr    r0
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      .enter_vmx_usercopy
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STACKFRAMESIZE+48(r1)
        ld      r4,STACKFRAMESIZE+56(r1)
        ld      r5,STACKFRAMESIZE+64(r1)
        mtlr    r0

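        /*
         * r3-r5 were clobbered across the enter_vmx_usercopy() call, so the
         * values saved at entry are reloaded from the caller's frame.  The
         * call's return value is kept in cr1 (cr0 is clobbered by the
         * cmpldi in the prefetch set-up below) and is only tested at the
         * beq further down.
         */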
        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

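        /*
         * Enhanced-touch forms of dcbt/dcbtst: TH=0b01000 passes the stream
         * start address and ID (0 for the load stream, 1 for the store
         * stream), TH=0b01010 passes the control word built above (cacheline
         * count, depth and, in r8, the GO bit).  The eieio orders the set-up
         * touches ahead of GO.
         */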
.machine push
.machine "power4"
        dcbt    r0,r6,0b01000
        dcbt    r0,r7,0b01010
        dcbtst  r0,r9,0b01000
        dcbtst  r0,r10,0b01010
        eieio
        dcbt    r0,r8,0b01010   /* GO */
.machine pop

        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

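        /*
         * lvx/stvx only take indexed (rA|0 + rB) addresses, so r9-r11 hold
         * the constant offsets 16/32/48, and r12/r14-r16 hold 64-112 for
         * the main loop further down.
         */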
        bf      cr7*4+3,5f
err3;   lvx     vr1,r0,r4
        addi    r4,r4,16
err3;   stvx    vr1,r0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
err3;   lvx     vr1,r0,r4
err3;   lvx     vr0,r4,r9
        addi    r4,r4,32
err3;   stvx    vr1,r0,r3
err3;   stvx    vr0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     vr3,r0,r4
err3;   lvx     vr2,r4,r9
err3;   lvx     vr1,r4,r10
err3;   lvx     vr0,r4,r11
        addi    r4,r4,64
err3;   stvx    vr3,r0,r3
err3;   stvx    vr2,r3,r9
err3;   stvx    vr1,r3,r10
err3;   stvx    vr0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(r14)(r1)
        std     r15,STK_REG(r15)(r1)
        std     r16,STK_REG(r16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     vr7,r0,r4
err4;   lvx     vr6,r4,r9
err4;   lvx     vr5,r4,r10
err4;   lvx     vr4,r4,r11
err4;   lvx     vr3,r4,r12
err4;   lvx     vr2,r4,r14
err4;   lvx     vr1,r4,r15
err4;   lvx     vr0,r4,r16
        addi    r4,r4,128
err4;   stvx    vr7,r0,r3
err4;   stvx    vr6,r3,r9
err4;   stvx    vr5,r3,r10
err4;   stvx    vr4,r3,r11
err4;   stvx    vr3,r3,r12
err4;   stvx    vr2,r3,r14
err4;   stvx    vr1,r3,r15
err4;   stvx    vr0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(r14)(r1)
        ld      r15,STK_REG(r15)(r1)
        ld      r16,STK_REG(r16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     vr3,r0,r4
err3;   lvx     vr2,r4,r9
err3;   lvx     vr1,r4,r10
err3;   lvx     vr0,r4,r11
        addi    r4,r4,64
err3;   stvx    vr3,r0,r3
err3;   stvx    vr2,r3,r9
err3;   stvx    vr1,r3,r10
err3;   stvx    vr0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     vr1,r0,r4
err3;   lvx     vr0,r4,r9
        addi    r4,r4,32
err3;   stvx    vr1,r0,r3
err3;   stvx    vr0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     vr1,r0,r4
        addi    r4,r4,16
err3;   stvx    vr1,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       .exit_vmx_usercopy      /* tail call optimise */

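/*
 * Source and destination are misaligned relative to each other: lvsl builds
 * a permute control vector from the source alignment, each step loads the
 * next aligned quadword, and vperm shifts the previous/current pair into a
 * destination-aligned 16B result.  The source pointer runs 16B ahead of the
 * data consumed, hence the -16 adjustment before the scalar tail.
 */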
.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r7,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        lvsl    vr16,0,r4       /* Setup permute control vector */
err3;   lvx     vr0,0,r4
        addi    r4,r4,16

        bf      cr7*4+3,5f
err3;   lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        addi    r4,r4,16
err3;   stvx    vr8,r0,r3
        addi    r3,r3,16
        vor     vr0,vr1,vr1

5:      bf      cr7*4+2,6f
err3;   lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
err3;   lvx     vr0,r4,r9
        vperm   vr9,vr1,vr0,vr16
        addi    r4,r4,32
err3;   stvx    vr8,r0,r3
err3;   stvx    vr9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     vr3,r0,r4
        vperm   vr8,vr0,vr3,vr16
err3;   lvx     vr2,r4,r9
        vperm   vr9,vr3,vr2,vr16
err3;   lvx     vr1,r4,r10
        vperm   vr10,vr2,vr1,vr16
err3;   lvx     vr0,r4,r11
        vperm   vr11,vr1,vr0,vr16
        addi    r4,r4,64
err3;   stvx    vr8,r0,r3
err3;   stvx    vr9,r3,r9
err3;   stvx    vr10,r3,r10
err3;   stvx    vr11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(r14)(r1)
        std     r15,STK_REG(r15)(r1)
        std     r16,STK_REG(r16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     vr7,r0,r4
        vperm   vr8,vr0,vr7,vr16
err4;   lvx     vr6,r4,r9
        vperm   vr9,vr7,vr6,vr16
err4;   lvx     vr5,r4,r10
        vperm   vr10,vr6,vr5,vr16
err4;   lvx     vr4,r4,r11
        vperm   vr11,vr5,vr4,vr16
err4;   lvx     vr3,r4,r12
        vperm   vr12,vr4,vr3,vr16
err4;   lvx     vr2,r4,r14
        vperm   vr13,vr3,vr2,vr16
err4;   lvx     vr1,r4,r15
        vperm   vr14,vr2,vr1,vr16
err4;   lvx     vr0,r4,r16
        vperm   vr15,vr1,vr0,vr16
        addi    r4,r4,128
err4;   stvx    vr8,r0,r3
err4;   stvx    vr9,r3,r9
err4;   stvx    vr10,r3,r10
err4;   stvx    vr11,r3,r11
err4;   stvx    vr12,r3,r12
err4;   stvx    vr13,r3,r14
err4;   stvx    vr14,r3,r15
err4;   stvx    vr15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(r14)(r1)
        ld      r15,STK_REG(r15)(r1)
        ld      r16,STK_REG(r16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     vr3,r0,r4
        vperm   vr8,vr0,vr3,vr16
err3;   lvx     vr2,r4,r9
        vperm   vr9,vr3,vr2,vr16
err3;   lvx     vr1,r4,r10
        vperm   vr10,vr2,vr1,vr16
err3;   lvx     vr0,r4,r11
        vperm   vr11,vr1,vr0,vr16
        addi    r4,r4,64
err3;   stvx    vr8,r0,r3
err3;   stvx    vr9,r3,r9
err3;   stvx    vr10,r3,r10
err3;   stvx    vr11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
err3;   lvx     vr0,r4,r9
        vperm   vr9,vr1,vr0,vr16
        addi    r4,r4,32
err3;   stvx    vr8,r0,r3
err3;   stvx    vr9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        addi    r4,r4,16
err3;   stvx    vr8,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r6,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       .exit_vmx_usercopy      /* tail call optimise */
#endif /* CONFIG_ALTIVEC */