arch/alpha/lib/memcpy.c

   1 /*
   2  *  linux/arch/alpha/lib/memcpy.c
   3  *
   4  *  Copyright (C) 1995  Linus Torvalds
   5  */
   6
   7 /*
   8  * This is a reasonably optimized memcpy() routine.
   9  */
  10
/*
 * Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in
 * an explicit compare against 0 (instead of just using the proper
 * "blt reg, xx" or "bge reg, xx"). I hope alpha-gcc will be fixed to notice
 * this eventually..
 */
  17
  18 #include <linux/types.h>
  19 #include <linux/export.h>
  20
  21 /*
  22  * This should be done in one go with ldq_u*2/mask/stq_u. Do it
  23  * with a macro so that we can fix it up later..
  24  */
  25 #define ALIGN_DEST_TO8_UP(d,s,n) \
  26         while (d & 7) { \
  27                 if (n <= 0) return; \
  28                 n--; \
  29                 *(char *) d = *(char *) s; \
  30                 d++; s++; \
  31         }
  32 #define ALIGN_DEST_TO8_DN(d,s,n) \
  33         while (d & 7) { \
  34                 if (n <= 0) return; \
  35                 n--; \
  36                 d--; s--; \
  37                 *(char *) d = *(char *) s; \
  38         }
  39
  40 /*
  41  * This should similarly be done with ldq_u*2/mask/stq. The destination
  42  * is aligned, but we don't fill in a full quad-word
  43  */
  44 #define DO_REST_UP(d,s,n) \
  45         while (n > 0) { \
  46                 n--; \
  47                 *(char *) d = *(char *) s; \
  48                 d++; s++; \
  49         }
  50 #define DO_REST_DN(d,s,n) \
  51         while (n > 0) { \
  52                 n--; \
  53                 d--; s--; \
  54                 *(char *) d = *(char *) s; \
  55         }
  56
  57 /*
  58  * This should be done with ldq/mask/stq. The source and destination are
  59  * aligned, but we don't fill in a full quad-word
  60  */
  61 #define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
  62 #define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)
  63
  64 /*
  65  * This does unaligned memory copies. We want to avoid storing to
  66  * an unaligned address, as that would do a read-modify-write cycle.
  67  * We also want to avoid double-reading the unaligned reads.
  68  *
  69  * Note the ordering to try to avoid load (and address generation) latencies.
  70  */
  71 static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
  72                                           long n)
  73 {
  74         ALIGN_DEST_TO8_UP(d,s,n);
  75         n -= 8;                 /* to avoid compare against 8 in the loop */
  76         if (n >= 0) {
  77                 unsigned long low_word, high_word;
  78                 __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
  79                 do {
  80                         unsigned long tmp;
  81                         __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
  82                         n -= 8;
  83                         __asm__("extql %1,%2,%0"
  84                                 :"=r" (low_word)
  85                                 :"r" (low_word), "r" (s));
  86                         __asm__("extqh %1,%2,%0"
  87                                 :"=r" (tmp)
  88                                 :"r" (high_word), "r" (s));
  89                         s += 8;
  90                         *(unsigned long *) d = low_word | tmp;
  91                         d += 8;
  92                         low_word = high_word;
  93                 } while (n >= 0);
  94         }
  95         n += 8;
  96         DO_REST_UP(d,s,n);
  97 }
  98
  99 static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
 100                                           long n)
 101 {
 102         /* I don't understand AXP assembler well enough for this. -Tim */
 103         s += n;
 104         d += n;
 105         while (n--)
 106                 * (char *) --d = * (char *) --s;
 107 }
 108
 109 /*
 110  * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 111  * for the load-store. I don't know why, but it would seem that using a floating
 112  * point register for the move seems to slow things down (very small difference,
 113  * though).
 114  *
 115  * Note the ordering to try to avoid load (and address generation) latencies.
 116  */
 117 static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
 118                                         long n)
 119 {
 120         ALIGN_DEST_TO8_UP(d,s,n);
 121         n -= 8;
 122         while (n >= 0) {
 123                 unsigned long tmp;
 124                 __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
 125                 n -= 8;
 126                 s += 8;
 127                 *(unsigned long *) d = tmp;
 128                 d += 8;
 129         }
 130         n += 8;
 131         DO_REST_ALIGNED_UP(d,s,n);
 132 }
 133 static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
 134                                         long n)
 135 {
 136         s += n;
 137         d += n;
 138         ALIGN_DEST_TO8_DN(d,s,n);
 139         n -= 8;
 140         while (n >= 0) {
 141                 unsigned long tmp;
 142                 s -= 8;
 143                 __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
 144                 n -= 8;
 145                 d -= 8;
 146                 *(unsigned long *) d = tmp;
 147         }
 148         n += 8;
 149         DO_REST_ALIGNED_DN(d,s,n);
 150 }
 151
 152 void * memcpy(void * dest, const void *src, size_t n)
 153 {
 154         if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
 155                 __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
 156                                      n);
 157                 return dest;
 158         }
 159         __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
 160         return dest;
 161 }
 162 EXPORT_SYMBOL(memcpy);