/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)
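/*
 * STK_REG(i) places the save slot for nonvolatile GPR i (r14 and up)
 * at frame offset 112 + (i - 14) * 8 within the 256 byte frame, so
 * r14 saves at offset 112, r15 at 120, and so on.
 */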
	.section	__ex_table,"a"
	.section	__ex_table,"a"
	.section	__ex_table,"a"
	.section	__ex_table,"a"
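/*
 * Each __ex_table entry pairs the address of a tagged user-access
 * instruction with a fixup address, so a fault in that load or store
 * branches to the matching recovery path instead of oopsing. The
 * err1-err4 annotations used throughout this file expand to entries
 * in this section.
 */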
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)

	ld	r0,STACKFRAMESIZE+16(r1)

#endif /* CONFIG_ALTIVEC */

	ld	r22,STK_REG(r22)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)

	addi	r1,r1,STACKFRAMESIZE

	b	__copy_tofrom_user_base
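	/*
	 * The recovery paths above unwind any stack frame and then
	 * branch to __copy_tofrom_user_base, which finishes the copy
	 * and computes the number of bytes left uncopied.
	 */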
_GLOBAL(__copy_tofrom_user_power7)

	/* Get the source 8B aligned */

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)
	std	r17,STK_REG(r17)(r1)
	std	r18,STK_REG(r18)(r1)
	std	r19,STK_REG(r19)(r1)
	std	r20,STK_REG(r20)(r1)
	std	r21,STK_REG(r21)(r1)
	std	r22,STK_REG(r22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	/* Now do cacheline (128B) sized loads and stores. */
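	/*
	 * Each iteration moves one 128 byte line as sixteen 8 byte
	 * loads followed by sixteen 8 byte stores (offsets 0 through
	 * 120), which is why the nonvolatile registers r14-r22 were
	 * saved above.
	 */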
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r22,STK_REG(r22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */

	/* Up to 63B to go */

	/* Up to 31B to go */

9:	clrldi	r5,r5,(64-4)
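	/*
	 * clrldi r5,r5,(64-4) clears all but the low 4 bits of the
	 * remaining length, leaving the 0-15 residual bytes copied below.
	 */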
	/* Up to 15B to go */

err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE

#ifdef CONFIG_ALTIVEC

	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
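	/*
	 * enter_vmx_usercopy is a C function and may clobber the
	 * volatile registers, so the copy arguments are reloaded from
	 * their save slots (and r0 from the link register slot) before
	 * the copy proceeds.
	 */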
	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
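	/*
	 * With TH=0b01010 the dcbt/dcbtst "address" operand is not an
	 * address at all: it encodes the stream ID, the length in
	 * cachelines and the prefetch depth, and a final dcbt with the
	 * GO bit set starts the configured streams.
	 */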
	ori	r9,r9,1		/* stream=1 */
	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */

1:	lis	r0,0x0E00	/* depth=7 */

	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */

	dcbtst	r0,r10,0b01010
	dcbt	r0,r8,0b01010	/* GO */

	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
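	/*
	 * "Relatively aligned" means the source and destination share
	 * the same offset within a 16 byte quadword; the rldicl. below
	 * tests the low 4 bits of that relationship. When they differ,
	 * every aligned 16 byte store must be assembled from two
	 * adjacent source vectors with vperm.
	 */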
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */

	/* Get the destination 128B aligned */
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */

err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */

err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
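	/*
	 * The stack frame was popped above, so this branch lets
	 * exit_vmx_usercopy return directly to our caller: a tail call
	 * rather than bl/blr.
	 */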
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */

err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */

	/* Get the destination 128B aligned */

	lvsl	vr16,0,r4	/* Setup permute control vector */
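	/*
	 * lvsl builds a byte-index vector from the low 4 bits of the
	 * source address; vperm then selects, from each pair of
	 * adjacent aligned loads, the 16 misaligned bytes actually
	 * wanted.
	 */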
	vperm	vr8,vr0,vr1,vr16

	vperm	vr8,vr0,vr1,vr16
	vperm	vr9,vr1,vr0,vr16

	vperm	vr8,vr0,vr3,vr16
	vperm	vr9,vr3,vr2,vr16
	vperm	vr10,vr2,vr1,vr16
	vperm	vr11,vr1,vr0,vr16

err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */

	vperm	vr8,vr0,vr7,vr16
	vperm	vr9,vr7,vr6,vr16
	vperm	vr10,vr6,vr5,vr16
	vperm	vr11,vr5,vr4,vr16
	vperm	vr12,vr4,vr3,vr16
	vperm	vr13,vr3,vr2,vr16
	vperm	vr14,vr2,vr1,vr16
	vperm	vr15,vr1,vr0,vr16

err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */

	vperm	vr8,vr0,vr3,vr16
	vperm	vr9,vr3,vr2,vr16
	vperm	vr10,vr2,vr1,vr16
	vperm	vr11,vr1,vr0,vr16

err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11

	vperm	vr8,vr0,vr1,vr16
	vperm	vr9,vr1,vr0,vr16

	vperm	vr8,vr0,vr1,vr16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
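	/*
	 * The permute loop reads one vector ahead of the data it
	 * stores, hence pulling r4 back by 16 before the scalar tail
	 * copy takes over.
	 */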
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */