arch/x86/mm/gup.c

   1 /*
   2  * Lockless get_user_pages_fast for x86
   3  *
   4  * Copyright (C) 2008 Nick Piggin
   5  * Copyright (C) 2008 Novell Inc.
   6  */
   7 #include <linux/sched.h>
   8 #include <linux/mm.h>
   9 #include <linux/vmstat.h>
  10 #include <linux/highmem.h>
  11 #include <linux/swap.h>
  12 #include <linux/memremap.h>
  13
  14 #include <asm/pgtable.h>
  15
  16 static inline pte_t gup_get_pte(pte_t *ptep)
  17 {
  18 #ifndef CONFIG_X86_PAE
  19         return READ_ONCE(*ptep);
  20 #else
  21         /*
  22          * With get_user_pages_fast, we walk down the pagetables without taking
  23          * any locks.  For this we would like to load the pointers atomically,
  24          * but that is not possible (without expensive cmpxchg8b) on PAE.  What
  25          * we do have is the guarantee that a pte will only either go from not
  26          * present to present, or present to not present or both -- it will not
  27          * switch to a completely different present page without a TLB flush in
  28          * between; something that we are blocking by holding interrupts off.
  29          *
  30          * Setting ptes from not present to present goes:
  31          * ptep->pte_high = h;
  32          * smp_wmb();
  33          * ptep->pte_low = l;
  34          *
  35          * And present to not present goes:
  36          * ptep->pte_low = 0;
  37          * smp_wmb();
  38          * ptep->pte_high = 0;
  39          *
  40          * We must ensure here that the load of pte_low sees l iff pte_high
  41          * sees h. We load pte_high *after* loading pte_low, which ensures we
  42          * don't see an older value of pte_high.  *Then* we recheck pte_low,
  43          * which ensures that we haven't picked up a changed pte high. We might
  44          * have got rubbish values from pte_low and pte_high, but we are
  45          * guaranteed that pte_low will not have the present bit set *unless*
  46          * it is 'l'. And get_user_pages_fast only operates on present ptes, so
  47          * we're safe.
  48          *
  49          * gup_get_pte should not be used or copied outside gup.c without being
  50          * very careful -- it does not atomically load the pte or anything that
  51          * is likely to be useful for you.
  52          */
  53         pte_t pte;
  54
  55 retry:
  56         pte.pte_low = ptep->pte_low;
  57         smp_rmb();
  58         pte.pte_high = ptep->pte_high;
  59         smp_rmb();
  60         if (unlikely(pte.pte_low != ptep->pte_low))
  61                 goto retry;
  62
  63         return pte;
  64 #endif
  65 }
  66
  67 static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
  68 {
  69         while ((*nr) - nr_start) {
  70                 struct page *page = pages[--(*nr)];
  71
  72                 ClearPageReferenced(page);
  73                 put_page(page);
  74         }
  75 }
  76
  77 /*
  78  * 'pteval' can come from a pte, pmd or pud.  We only check
  79  * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
  80  * same value on all 3 types.
  81  */
  82 static inline int pte_allows_gup(unsigned long pteval, int write)
  83 {
  84         unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
  85
  86         if (write)
  87                 need_pte_bits |= _PAGE_RW;
  88
  89         if ((pteval & need_pte_bits) != need_pte_bits)
  90                 return 0;
  91
  92         return 1;
  93 }
  94
  95 /*
  96  * The performance critical leaf functions are made noinline otherwise gcc
  97  * inlines everything into a single function which results in too much
  98  * register pressure.
  99  */
 100 static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 101                 unsigned long end, int write, struct page **pages, int *nr)
 102 {
 103         struct dev_pagemap *pgmap = NULL;
 104         int nr_start = *nr;
 105         pte_t *ptep;
 106
 107         ptep = pte_offset_map(&pmd, addr);
 108         do {
 109                 pte_t pte = gup_get_pte(ptep);
 110                 struct page *page;
 111
 112                 /* Similar to the PMD case, NUMA hinting must take slow path */
 113                 if (pte_protnone(pte)) {
 114                         pte_unmap(ptep);
 115                         return 0;
 116                 }
 117
 118                 page = pte_page(pte);
 119                 if (pte_devmap(pte)) {
 120                         pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
 121                         if (unlikely(!pgmap)) {
 122                                 undo_dev_pagemap(nr, nr_start, pages);
 123                                 pte_unmap(ptep);
 124                                 return 0;
 125                         }
 126                 } else if (!pte_allows_gup(pte_val(pte), write) ||
 127                            pte_special(pte)) {
 128                         pte_unmap(ptep);
 129                         return 0;
 130                 }
 131                 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 132                 get_page(page);
 133                 put_dev_pagemap(pgmap);
 134                 SetPageReferenced(page);
 135                 pages[*nr] = page;
 136                 (*nr)++;
 137
 138         } while (ptep++, addr += PAGE_SIZE, addr != end);
 139         pte_unmap(ptep - 1);
 140
 141         return 1;
 142 }
 143
 144 static inline void get_head_page_multiple(struct page *page, int nr)
 145 {
 146         VM_BUG_ON_PAGE(page != compound_head(page), page);
 147         VM_BUG_ON_PAGE(page_count(page) == 0, page);
 148         atomic_add(nr, &page->_count);
 149         SetPageReferenced(page);
 150 }
 151
 152 static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
 153                 unsigned long end, struct page **pages, int *nr)
 154 {
 155         int nr_start = *nr;
 156         unsigned long pfn = pmd_pfn(pmd);
 157         struct dev_pagemap *pgmap = NULL;
 158
 159         pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
 160         do {
 161                 struct page *page = pfn_to_page(pfn);
 162
 163                 pgmap = get_dev_pagemap(pfn, pgmap);
 164                 if (unlikely(!pgmap)) {
 165                         undo_dev_pagemap(nr, nr_start, pages);
 166                         return 0;
 167                 }
 168                 SetPageReferenced(page);
 169                 pages[*nr] = page;
 170                 get_page(page);
 171                 put_dev_pagemap(pgmap);
 172                 (*nr)++;
 173                 pfn++;
 174         } while (addr += PAGE_SIZE, addr != end);
 175         return 1;
 176 }
 177
 178 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 179                 unsigned long end, int write, struct page **pages, int *nr)
 180 {
 181         struct page *head, *page;
 182         int refs;
 183
 184         if (!pte_allows_gup(pmd_val(pmd), write))
 185                 return 0;
 186
 187         VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
 188         if (pmd_devmap(pmd))
 189                 return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
 190
 191         /* hugepages are never "special" */
 192         VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
 193
 194         refs = 0;
 195         head = pmd_page(pmd);
 196         page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 197         do {
 198                 VM_BUG_ON_PAGE(compound_head(page) != head, page);
 199                 pages[*nr] = page;
 200                 (*nr)++;
 201                 page++;
 202                 refs++;
 203         } while (addr += PAGE_SIZE, addr != end);
 204         get_head_page_multiple(head, refs);
 205
 206         return 1;
 207 }
 208
 209 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 210                 int write, struct page **pages, int *nr)
 211 {
 212         unsigned long next;
 213         pmd_t *pmdp;
 214
 215         pmdp = pmd_offset(&pud, addr);
 216         do {
 217                 pmd_t pmd = *pmdp;
 218
 219                 next = pmd_addr_end(addr, end);
 220                 if (pmd_none(pmd))
 221                         return 0;
 222                 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
 223                         /*
 224                          * NUMA hinting faults need to be handled in the GUP
 225                          * slowpath for accounting purposes and so that they
 226                          * can be serialised against THP migration.
 227                          */
 228                         if (pmd_protnone(pmd))
 229                                 return 0;
 230                         if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
 231                                 return 0;
 232                 } else {
 233                         if (!gup_pte_range(pmd, addr, next, write, pages, nr))
 234                                 return 0;
 235                 }
 236         } while (pmdp++, addr = next, addr != end);
 237
 238         return 1;
 239 }
 240
 241 static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
 242                 unsigned long end, int write, struct page **pages, int *nr)
 243 {
 244         struct page *head, *page;
 245         int refs;
 246
 247         if (!pte_allows_gup(pud_val(pud), write))
 248                 return 0;
 249         /* hugepages are never "special" */
 250         VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
 251         VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
 252
 253         refs = 0;
 254         head = pud_page(pud);
 255         page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 256         do {
 257                 VM_BUG_ON_PAGE(compound_head(page) != head, page);
 258                 pages[*nr] = page;
 259                 (*nr)++;
 260                 page++;
 261                 refs++;
 262         } while (addr += PAGE_SIZE, addr != end);
 263         get_head_page_multiple(head, refs);
 264
 265         return 1;
 266 }
 267
 268 static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 269                         int write, struct page **pages, int *nr)
 270 {
 271         unsigned long next;
 272         pud_t *pudp;
 273
 274         pudp = pud_offset(&pgd, addr);
 275         do {
 276                 pud_t pud = *pudp;
 277
 278                 next = pud_addr_end(addr, end);
 279                 if (pud_none(pud))
 280                         return 0;
 281                 if (unlikely(pud_large(pud))) {
 282                         if (!gup_huge_pud(pud, addr, next, write, pages, nr))
 283                                 return 0;
 284                 } else {
 285                         if (!gup_pmd_range(pud, addr, next, write, pages, nr))
 286                                 return 0;
 287                 }
 288         } while (pudp++, addr = next, addr != end);
 289
 290         return 1;
 291 }
 292
 293 /*
 294  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
 295  * back to the regular GUP.
 296  */
 297 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 298                           struct page **pages)
 299 {
 300         struct mm_struct *mm = current->mm;
 301         unsigned long addr, len, end;
 302         unsigned long next;
 303         unsigned long flags;
 304         pgd_t *pgdp;
 305         int nr = 0;
 306
 307         start &= PAGE_MASK;
 308         addr = start;
 309         len = (unsigned long) nr_pages << PAGE_SHIFT;
 310         end = start + len;
 311         if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
 312                                         (void __user *)start, len)))
 313                 return 0;
 314
 315         /*
 316          * XXX: batch / limit 'nr', to avoid large irq off latency
 317          * needs some instrumenting to determine the common sizes used by
 318          * important workloads (eg. DB2), and whether limiting the batch size
 319          * will decrease performance.
 320          *
 321          * It seems like we're in the clear for the moment. Direct-IO is
 322          * the main guy that batches up lots of get_user_pages, and even
 323          * they are limited to 64-at-a-time which is not so many.
 324          */
 325         /*
 326          * This doesn't prevent pagetable teardown, but does prevent
 327          * the pagetables and pages from being freed on x86.
 328          *
 329          * So long as we atomically load page table pointers versus teardown
 330          * (which we do on x86, with the above PAE exception), we can follow the
 331          * address down to the the page and take a ref on it.
 332          */
 333         local_irq_save(flags);
 334         pgdp = pgd_offset(mm, addr);
 335         do {
 336                 pgd_t pgd = *pgdp;
 337
 338                 next = pgd_addr_end(addr, end);
 339                 if (pgd_none(pgd))
 340                         break;
 341                 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
 342                         break;
 343         } while (pgdp++, addr = next, addr != end);
 344         local_irq_restore(flags);
 345
 346         return nr;
 347 }
 348
 349 /**
 350  * get_user_pages_fast() - pin user pages in memory
 351  * @start:      starting user address
 352  * @nr_pages:   number of pages from start to pin
 353  * @write:      whether pages will be written to
 354  * @pages:      array that receives pointers to the pages pinned.
 355  *              Should be at least nr_pages long.
 356  *
 357  * Attempt to pin user pages in memory without taking mm->mmap_sem.
 358  * If not successful, it will fall back to taking the lock and
 359  * calling get_user_pages().
 360  *
 361  * Returns number of pages pinned. This may be fewer than the number
 362  * requested. If nr_pages is 0 or negative, returns 0. If no pages
 363  * were pinned, returns -errno.
 364  */
 365 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 366                         struct page **pages)
 367 {
 368         struct mm_struct *mm = current->mm;
 369         unsigned long addr, len, end;
 370         unsigned long next;
 371         pgd_t *pgdp;
 372         int nr = 0;
 373
 374         start &= PAGE_MASK;
 375         addr = start;
 376         len = (unsigned long) nr_pages << PAGE_SHIFT;
 377
 378         end = start + len;
 379         if (end < start)
 380                 goto slow_irqon;
 381
 382 #ifdef CONFIG_X86_64
 383         if (end >> __VIRTUAL_MASK_SHIFT)
 384                 goto slow_irqon;
 385 #endif
 386
 387         /*
 388          * XXX: batch / limit 'nr', to avoid large irq off latency
 389          * needs some instrumenting to determine the common sizes used by
 390          * important workloads (eg. DB2), and whether limiting the batch size
 391          * will decrease performance.
 392          *
 393          * It seems like we're in the clear for the moment. Direct-IO is
 394          * the main guy that batches up lots of get_user_pages, and even
 395          * they are limited to 64-at-a-time which is not so many.
 396          */
 397         /*
 398          * This doesn't prevent pagetable teardown, but does prevent
 399          * the pagetables and pages from being freed on x86.
 400          *
 401          * So long as we atomically load page table pointers versus teardown
 402          * (which we do on x86, with the above PAE exception), we can follow the
 403          * address down to the the page and take a ref on it.
 404          */
 405         local_irq_disable();
 406         pgdp = pgd_offset(mm, addr);
 407         do {
 408                 pgd_t pgd = *pgdp;
 409
 410                 next = pgd_addr_end(addr, end);
 411                 if (pgd_none(pgd))
 412                         goto slow;
 413                 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
 414                         goto slow;
 415         } while (pgdp++, addr = next, addr != end);
 416         local_irq_enable();
 417
 418         VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
 419         return nr;
 420
 421         {
 422                 int ret;
 423
 424 slow:
 425                 local_irq_enable();
 426 slow_irqon:
 427                 /* Try to get the remaining pages with get_user_pages */
 428                 start += nr << PAGE_SHIFT;
 429                 pages += nr;
 430
 431                 ret = get_user_pages_unlocked(start,
 432                                               (end - start) >> PAGE_SHIFT,
 433                                               write, 0, pages);
 434
 435                 /* Have to be a bit careful with return values */
 436                 if (nr > 0) {
 437                         if (ret < 0)
 438                                 ret = nr;
 439                         else
 440                                 ret += nr;
 441                 }
 442
 443                 return ret;
 444         }
 445 }