/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE  256
#define STK_REG(i)      (112 + ((i)-14)*8)
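/*
 * The cache-line copy loops below need more scratch than the volatile
 * GPRs provide, so they spill the non-volatile r14-r22 to the stack.
 * STK_REG(Rn) maps register n (14 and up) to its save slot, starting at
 * offset 112 in the STACKFRAMESIZE-byte frame pushed for those loops.
 */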

_GLOBAL(memcpy_power7)
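        /*
         * r3 = dest, r4 = src, r5 = len.  Copies shorter than 16 bytes go
         * straight to .Lshort_copy, copies larger than 4096 bytes take the
         * VMX path when Altivec is available, and everything else uses the
         * scalar .Lnonvmx_copy path.  The destination pointer is stashed
         * at 48(r1) so it can be reloaded into r3 and returned.
         */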
#ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,4096

        std     r3,48(r1)

        blt     .Lshort_copy
        bgt     cr1,.Lvmx_copy
#else
        cmpldi  r5,16

        std     r3,48(r1)

        blt     .Lshort_copy
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
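        /*
         * r6 = -src, so its low bits are the byte count needed to reach
         * 8B alignment.  mtocrf 0x01 places those bits in cr7: the "bf"
         * tests below copy 1, 2 and 4 bytes as required, and the clrldi
         * keeps (-src & 7) in r6 so the length can be adjusted at 3: below.
         */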
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

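        /*
         * r14-r22 are non-volatile in the 64-bit PowerPC ABI, so push a
         * frame and save them (plus LR) before the cache-line loop below
         * borrows them as extra scratch.
         */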
        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
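        /*
         * ctr = len / 128.  Each iteration issues sixteen 8-byte loads and
         * then sixteen stores, moving one full cache line with a single
         * pointer bump per side.
         */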
        .align  5
4:
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        ld      r14,64(r4)
        ld      r15,72(r4)
        ld      r16,80(r4)
        ld      r17,88(r4)
        ld      r18,96(r4)
        ld      r19,104(r4)
        ld      r20,112(r4)
        ld      r21,120(r4)
        addi    r4,r4,128
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        std     r14,64(r3)
        std     r15,72(r3)
        std     r16,80(r3)
        std     r17,88(r3)
        std     r18,96(r3)
        std     r19,104(r3)
        std     r20,112(r3)
        std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
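        /*
         * Bits 6:4 of the remaining length go into cr7: the tests below
         * copy 64, 32 and 16 byte blocks, and the low 4 bits are left for
         * .Lshort_copy.
         */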
5:      srdi    r6,r5,4
        mtocrf  0x01,r6

6:      bf      cr7*4+1,7f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        addi    r4,r4,64
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        addi    r4,r4,32
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
        ld      r0,0(r4)
        ld      r6,8(r4)
        addi    r4,r4,16
        std     r0,0(r3)
        std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     ld      r3,48(r1)
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
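        /*
         * enter_vmx_copy returns non-zero when VMX may be used here; it
         * refuses in contexts where the vector unit cannot be used.  r4,
         * r5 and LR are volatile across the call, so save and restore
         * them around it; r3 (dest) is already saved at 48(r1).
         */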
        mflr    r0
        std     r4,56(r1)
        std     r5,64(r1)
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      .enter_vmx_copy
        cmpwi   r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STACKFRAMESIZE+48(r1)
        ld      r4,STACKFRAMESIZE+56(r1)
        ld      r5,STACKFRAMESIZE+64(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  cr1,r7,0x3FF
        ble     cr1,1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

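        /*
         * The dcbt/dcbtst forms with TH=0b01000 describe a stream's start
         * address and the TH=0b01010 forms its length and depth; the final
         * dcbt with the GO bit set (r8) starts the described streams, with
         * the eieio ordering it after the descriptors.  The .machine
         * push/pop lets the assembler accept these extended, three-operand
         * forms.
         */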
.machine push
.machine "power4"
        dcbt    r0,r6,0b01000
        dcbt    r0,r7,0b01010
        dcbtst  r0,r9,0b01000
        dcbtst  r0,r10,0b01010
        eieio
        dcbt    r0,r8,0b01010   /* GO */
.machine pop

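        /* enter_vmx_copy returned 0: pop the frame and do a scalar copy */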
        beq     .Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
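        /*
         * "Relatively aligned" means (src ^ dst) has no bits set below bit
         * 4: both pointers sit at the same offset within a 16B quadword,
         * so plain lvx/stvx can be used once the destination is aligned.
         */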
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
        lvx     vr1,r0,r4
        addi    r4,r4,16
        stvx    vr1,r0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
        lvx     vr1,r0,r4
        lvx     vr0,r4,r9
        addi    r4,r4,32
        stvx    vr1,r0,r3
        stvx    vr0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     vr3,r0,r4
        lvx     vr2,r4,r9
        lvx     vr1,r4,r10
        lvx     vr0,r4,r11
        addi    r4,r4,64
        stvx    vr3,r0,r3
        stvx    vr2,r3,r9
        stvx    vr1,r3,r10
        stvx    vr0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     vr7,r0,r4
        lvx     vr6,r4,r9
        lvx     vr5,r4,r10
        lvx     vr4,r4,r11
        lvx     vr3,r4,r12
        lvx     vr2,r4,r14
        lvx     vr1,r4,r15
        lvx     vr0,r4,r16
        addi    r4,r4,128
        stvx    vr7,r0,r3
        stvx    vr6,r3,r9
        stvx    vr5,r3,r10
        stvx    vr4,r3,r11
        stvx    vr3,r3,r12
        stvx    vr2,r3,r14
        stvx    vr1,r3,r15
        stvx    vr0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     vr3,r0,r4
        lvx     vr2,r4,r9
        lvx     vr1,r4,r10
        lvx     vr0,r4,r11
        addi    r4,r4,64
        stvx    vr3,r0,r3
        stvx    vr2,r3,r9
        stvx    vr1,r3,r10
        stvx    vr0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     vr1,r0,r4
        lvx     vr0,r4,r9
        addi    r4,r4,32
        stvx    vr1,r0,r3
        stvx    vr0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     vr1,r0,r4
        addi    r4,r4,16
        stvx    vr1,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,48(r1)
        b       .exit_vmx_copy          /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r7,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

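        /*
         * lvsl builds a permute mask from the low bits of the source
         * address.  lvx always loads an aligned quadword, so each step
         * below loads the next aligned 16 bytes and uses vperm to splice
         * the previous and current quadwords into the bytes that actually
         * follow the unaligned source pointer.  One quadword is read
         * ahead, hence the extra addi r4,r4,16 here and the -16 unwind
         * before the scalar tail at 11: below.
         */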
        lvsl    vr16,0,r4       /* Setup permute control vector */
        lvx     vr0,0,r4
        addi    r4,r4,16

        bf      cr7*4+3,5f
        lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        addi    r4,r4,16
        stvx    vr8,r0,r3
        addi    r3,r3,16
        vor     vr0,vr1,vr1

5:      bf      cr7*4+2,6f
        lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        lvx     vr0,r4,r9
        vperm   vr9,vr1,vr0,vr16
        addi    r4,r4,32
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     vr3,r0,r4
        vperm   vr8,vr0,vr3,vr16
        lvx     vr2,r4,r9
        vperm   vr9,vr3,vr2,vr16
        lvx     vr1,r4,r10
        vperm   vr10,vr2,vr1,vr16
        lvx     vr0,r4,r11
        vperm   vr11,vr1,vr0,vr16
        addi    r4,r4,64
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        stvx    vr10,r3,r10
        stvx    vr11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     vr7,r0,r4
        vperm   vr8,vr0,vr7,vr16
        lvx     vr6,r4,r9
        vperm   vr9,vr7,vr6,vr16
        lvx     vr5,r4,r10
        vperm   vr10,vr6,vr5,vr16
        lvx     vr4,r4,r11
        vperm   vr11,vr5,vr4,vr16
        lvx     vr3,r4,r12
        vperm   vr12,vr4,vr3,vr16
        lvx     vr2,r4,r14
        vperm   vr13,vr3,vr2,vr16
        lvx     vr1,r4,r15
        vperm   vr14,vr2,vr1,vr16
        lvx     vr0,r4,r16
        vperm   vr15,vr1,vr0,vr16
        addi    r4,r4,128
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        stvx    vr10,r3,r10
        stvx    vr11,r3,r11
        stvx    vr12,r3,r12
        stvx    vr13,r3,r14
        stvx    vr14,r3,r15
        stvx    vr15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     vr3,r0,r4
        vperm   vr8,vr0,vr3,vr16
        lvx     vr2,r4,r9
        vperm   vr9,vr3,vr2,vr16
        lvx     vr1,r4,r10
        vperm   vr10,vr2,vr1,vr16
        lvx     vr0,r4,r11
        vperm   vr11,vr1,vr0,vr16
        addi    r4,r4,64
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        stvx    vr10,r3,r10
        stvx    vr11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        lvx     vr0,r4,r9
        vperm   vr9,vr1,vr0,vr16
        addi    r4,r4,32
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        addi    r4,r4,16
        stvx    vr8,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,48(r1)
        b       .exit_vmx_copy          /* tail call optimise */
#endif /* CONFIG_ALTIVEC */