mm: memcontrol: rewrite charge API

[cascardo/linux.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index f009a14..1cbe1e5 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2551,55 +2551,63 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
         return NOTIFY_OK;
  }
  
-
-/* See mem_cgroup_try_charge() for details */
-enum {
-       CHARGE_OK,              /* success */
-       CHARGE_RETRY,           /* need to retry but retry is not bad */
-       CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
-       CHARGE_WOULDBLOCK,      /* GFP_WAIT wasn't set and no enough res. */
-};
-
-static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-                               unsigned int nr_pages, unsigned int min_pages,
-                               bool invoke_oom)
+static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+                     unsigned int nr_pages)
  {
-       unsigned long csize = nr_pages * PAGE_SIZE;
+       unsigned int batch = max(CHARGE_BATCH, nr_pages);
+       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
         struct mem_cgroup *mem_over_limit;
         struct res_counter *fail_res;
+       unsigned long nr_reclaimed;
         unsigned long flags = 0;
-       int ret;
+       unsigned long long size;
+       int ret = 0;
  
-       ret = res_counter_charge(&memcg->res, csize, &fail_res);
+retry:
+       if (consume_stock(memcg, nr_pages))
+               goto done;
  
-       if (likely(!ret)) {
+       size = batch * PAGE_SIZE;
+       if (!res_counter_charge(&memcg->res, size, &fail_res)) {
                 if (!do_swap_account)
-                       return CHARGE_OK;
-               ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
-               if (likely(!ret))
-                       return CHARGE_OK;
-
-               res_counter_uncharge(&memcg->res, csize);
+                       goto done_restock;
+               if (!res_counter_charge(&memcg->memsw, size, &fail_res))
+                       goto done_restock;
+               res_counter_uncharge(&memcg->res, size);
                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
                 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
         } else
                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+       if (batch > nr_pages) {
+               batch = nr_pages;
+               goto retry;
+       }
+
         /*
-        * Never reclaim on behalf of optional batching, retry with a
-        * single page instead.
+        * Unlike in global OOM situations, memcg is not in a physical
+        * memory shortage.  Allow dying and OOM-killed tasks to
+        * bypass the last charges so that they can exit quickly and
+        * free their memory.
          */
-       if (nr_pages > min_pages)
-               return CHARGE_RETRY;
+       if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+                    fatal_signal_pending(current) ||
+                    current->flags & PF_EXITING))
+               goto bypass;
+
+       if (unlikely(task_in_memcg_oom(current)))
+               goto nomem;
  
         if (!(gfp_mask & __GFP_WAIT))
-               return CHARGE_WOULDBLOCK;
+               goto nomem;
  
-       if (gfp_mask & __GFP_NORETRY)
-               return CHARGE_NOMEM;
+       nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
  
-       ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
-               return CHARGE_RETRY;
+               goto retry;
+
+       if (gfp_mask & __GFP_NORETRY)
+               goto nomem;
         /*
          * Even though the limit is exceeded at this point, reclaim
          * may have been able to free some pages.  Retry the charge
@@ -2609,142 +2617,47 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
          * unlikely to succeed so close to the limit, and we fall back
          * to regular pages anyway in case of failure.
          */
-       if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
-               return CHARGE_RETRY;
-
+       if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+               goto retry;
         /*
          * At task move, charge accounts can be doubly counted. So, it's
          * better to wait until the end of task_move if something is going on.
          */
         if (mem_cgroup_wait_acct_move(mem_over_limit))
-               return CHARGE_RETRY;
-
-       if (invoke_oom)
-               mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
-
-       return CHARGE_NOMEM;
-}
-
-/**
- * mem_cgroup_try_charge - try charging a memcg
- * @memcg: memcg to charge
- * @nr_pages: number of pages to charge
- * @oom: trigger OOM if reclaim fails
- *
- * Returns 0 if @memcg was charged successfully, -EINTR if the charge
- * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
- */
-static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
-                                gfp_t gfp_mask,
-                                unsigned int nr_pages,
-                                bool oom)
-{
-       unsigned int batch = max(CHARGE_BATCH, nr_pages);
-       int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-       int ret;
-
-       if (mem_cgroup_is_root(memcg))
-               goto done;
-       /*
-        * Unlike in global OOM situations, memcg is not in a physical
-        * memory shortage.  Allow dying and OOM-killed tasks to
-        * bypass the last charges so that they can exit quickly and
-        * free their memory.
-        */
-       if (unlikely(test_thread_flag(TIF_MEMDIE) ||
-                    fatal_signal_pending(current) ||
-                    current->flags & PF_EXITING))
-               goto bypass;
+               goto retry;
  
-       if (unlikely(task_in_memcg_oom(current)))
-               goto nomem;
+       if (nr_retries--)
+               goto retry;
  
         if (gfp_mask & __GFP_NOFAIL)
-               oom = false;
-again:
-       if (consume_stock(memcg, nr_pages))
-               goto done;
-
-       do {
-               bool invoke_oom = oom && !nr_oom_retries;
-
-               /* If killed, bypass charge */
-               if (fatal_signal_pending(current))
-                       goto bypass;
+               goto bypass;
  
-               ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
-                                          nr_pages, invoke_oom);
-               switch (ret) {
-               case CHARGE_OK:
-                       break;
-               case CHARGE_RETRY: /* not in OOM situation but retry */
-                       batch = nr_pages;
-                       goto again;
-               case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
-                       goto nomem;
-               case CHARGE_NOMEM: /* OOM routine works */
-                       if (!oom || invoke_oom)
-                               goto nomem;
-                       nr_oom_retries--;
-                       break;
-               }
-       } while (ret != CHARGE_OK);
+       if (fatal_signal_pending(current))
+               goto bypass;
  
-       if (batch > nr_pages)
-               refill_stock(memcg, batch - nr_pages);
-done:
-       return 0;
+       mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE));
  nomem:
         if (!(gfp_mask & __GFP_NOFAIL))
                 return -ENOMEM;
  bypass:
-       return -EINTR;
-}
-
-/**
- * mem_cgroup_try_charge_mm - try charging a mm
- * @mm: mm_struct to charge
- * @nr_pages: number of pages to charge
- * @oom: trigger OOM if reclaim fails
- *
- * Returns the charged mem_cgroup associated with the given mm_struct or
- * NULL the charge failed.
- */
-static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
-                                gfp_t gfp_mask,
-                                unsigned int nr_pages,
-                                bool oom)
+       memcg = root_mem_cgroup;
+       ret = -EINTR;
+       goto retry;
  
-{
-       struct mem_cgroup *memcg;
-       int ret;
-
-       memcg = get_mem_cgroup_from_mm(mm);
-       ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
-       css_put(&memcg->css);
-       if (ret == -EINTR)
-               memcg = root_mem_cgroup;
-       else if (ret)
-               memcg = NULL;
-
-       return memcg;
+done_restock:
+       if (batch > nr_pages)
+               refill_stock(memcg, batch - nr_pages);
+done:
+       return ret;
  }
  
-/*
- * Somemtimes we have to undo a charge we got by try_charge().
- * This function is for that and do uncharge, put css's refcnt.
- * gotten by try_charge().
- */
-static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
-                                      unsigned int nr_pages)
+static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
  {
-       if (!mem_cgroup_is_root(memcg)) {
-               unsigned long bytes = nr_pages * PAGE_SIZE;
+       unsigned long bytes = nr_pages * PAGE_SIZE;
  
-               res_counter_uncharge(&memcg->res, bytes);
-               if (do_swap_account)
-                       res_counter_uncharge(&memcg->memsw, bytes);
-       }
+       res_counter_uncharge(&memcg->res, bytes);
+       if (do_swap_account)
+               res_counter_uncharge(&memcg->memsw, bytes);
  }
  
  /*
@@ -2756,9 +2669,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
  {
         unsigned long bytes = nr_pages * PAGE_SIZE;
  
-       if (mem_cgroup_is_root(memcg))
-               return;
-
         res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
         if (do_swap_account)
                 res_counter_uncharge_until(&memcg->memsw,
@@ -2807,17 +2717,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
         return memcg;
  }
  
-static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
-                                      struct page *page,
-                                      unsigned int nr_pages,
-                                      enum charge_type ctype,
-                                      bool lrucare)
+static void commit_charge(struct page *page, struct mem_cgroup *memcg,
+                         unsigned int nr_pages, bool anon, bool lrucare)
  {
         struct page_cgroup *pc = lookup_page_cgroup(page);
         struct zone *uninitialized_var(zone);
         struct lruvec *lruvec;
         bool was_on_lru = false;
-       bool anon;
  
         lock_page_cgroup(pc);
         VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
@@ -2842,14 +2748,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
         }
  
         pc->mem_cgroup = memcg;
-       /*
-        * We access a page_cgroup asynchronously without lock_page_cgroup().
-        * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
-        * is accessed after testing USED bit. To make pc->mem_cgroup visible
-        * before USED bit, we need memory barrier here.
-        * See mem_cgroup_add_lru_list(), etc.
-        */
-       smp_wmb();
         SetPageCgroupUsed(pc);
  
         if (lrucare) {
@@ -2862,11 +2760,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
                 spin_unlock_irq(&zone->lru_lock);
         }
  
-       if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
-               anon = true;
-       else
-               anon = false;
-
         mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
         unlock_page_cgroup(pc);
  
@@ -2937,22 +2830,21 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
         if (ret)
                 return ret;
  
-       ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
-                                   oom_gfp_allowed(gfp));
+       ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
         if (ret == -EINTR)  {
                 /*
-                * mem_cgroup_try_charge() chosed to bypass to root due to
-                * OOM kill or fatal signal.  Since our only options are to
-                * either fail the allocation or charge it to this cgroup, do
-                * it as a temporary condition. But we can't fail. From a
-                * kmem/slab perspective, the cache has already been selected,
-                * by mem_cgroup_kmem_get_cache(), so it is too late to change
+                * try_charge() chose to bypass to root due to OOM kill or
+                * fatal signal.  Since our only options are to either fail
+                * the allocation or charge it to this cgroup, do it as a
+                * temporary condition. But we can't fail. From a kmem/slab
+                * perspective, the cache has already been selected, by
+                * mem_cgroup_kmem_get_cache(), so it is too late to change
                  * our minds.
                  *
                  * This condition will only trigger if the task entered
-                * memcg_charge_kmem in a sane state, but was OOM-killed during
-                * mem_cgroup_try_charge() above. Tasks that were already
-                * dying when the allocation triggers should have been already
+                * memcg_charge_kmem in a sane state, but was OOM-killed
+                * during try_charge() above. Tasks that were already dying
+                * when the allocation triggers should have been already
                  * directed to the root cgroup in memcontrol.h
                  */
                 res_counter_charge_nofail(&memcg->res, size, &fail_res);
@@ -3463,12 +3355,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
                 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
                 return;
         }
-
+       /*
+        * The page is freshly allocated and not visible to any
+        * outside callers yet.  Set up pc non-atomically.
+        */
         pc = lookup_page_cgroup(page);
-       lock_page_cgroup(pc);
         pc->mem_cgroup = memcg;
-       SetPageCgroupUsed(pc);
-       unlock_page_cgroup(pc);
+       pc->flags = PCG_USED;
  }
  
  void __memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -3478,19 +3371,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
  
  
         pc = lookup_page_cgroup(page);
-       /*
-        * Fast unlocked return. Theoretically might have changed, have to
-        * check again after locking.
-        */
         if (!PageCgroupUsed(pc))
                 return;
  
-       lock_page_cgroup(pc);
-       if (PageCgroupUsed(pc)) {
-               memcg = pc->mem_cgroup;
-               ClearPageCgroupUsed(pc);
-       }
-       unlock_page_cgroup(pc);
+       memcg = pc->mem_cgroup;
+       pc->flags = 0;
  
         /*
          * We trust that only if there is a memcg associated with the page, it
@@ -3531,7 +3416,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
         for (i = 1; i < HPAGE_PMD_NR; i++) {
                 pc = head_pc + i;
                 pc->mem_cgroup = memcg;
-               smp_wmb();/* see __commit_charge() */
                 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
         }
         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
@@ -3682,170 +3566,6 @@ out:
         return ret;
  }
  
-int mem_cgroup_charge_anon(struct page *page,
-                             struct mm_struct *mm, gfp_t gfp_mask)
-{
-       unsigned int nr_pages = 1;
-       struct mem_cgroup *memcg;
-       bool oom = true;
-
-       if (mem_cgroup_disabled())
-               return 0;
-
-       VM_BUG_ON_PAGE(page_mapped(page), page);
-       VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
-       VM_BUG_ON(!mm);
-
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-               /*
-                * Never OOM-kill a process for a huge page.  The
-                * fault handler will fall back to regular pages.
-                */
-               oom = false;
-       }
-
-       memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
-       if (!memcg)
-               return -ENOMEM;
-       __mem_cgroup_commit_charge(memcg, page, nr_pages,
-                                  MEM_CGROUP_CHARGE_TYPE_ANON, false);
-       return 0;
-}
-
-/*
- * While swap-in, try_charge -> commit or cancel, the page is locked.
- * And when try_charge() successfully returns, one refcnt to memcg without
- * struct page_cgroup is acquired. This refcnt will be consumed by
- * "commit()" or removed by "cancel()"
- */
-static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-                                         struct page *page,
-                                         gfp_t mask,
-                                         struct mem_cgroup **memcgp)
-{
-       struct mem_cgroup *memcg = NULL;
-       struct page_cgroup *pc;
-       int ret;
-
-       pc = lookup_page_cgroup(page);
-       /*
-        * Every swap fault against a single page tries to charge the
-        * page, bail as early as possible.  shmem_unuse() encounters
-        * already charged pages, too.  The USED bit is protected by
-        * the page lock, which serializes swap cache removal, which
-        * in turn serializes uncharging.
-        */
-       if (PageCgroupUsed(pc))
-               goto out;
-       if (do_swap_account)
-               memcg = try_get_mem_cgroup_from_page(page);
-       if (!memcg)
-               memcg = get_mem_cgroup_from_mm(mm);
-       ret = mem_cgroup_try_charge(memcg, mask, 1, true);
-       css_put(&memcg->css);
-       if (ret == -EINTR)
-               memcg = root_mem_cgroup;
-       else if (ret)
-               return ret;
-out:
-       *memcgp = memcg;
-       return 0;
-}
-
-int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
-                                gfp_t gfp_mask, struct mem_cgroup **memcgp)
-{
-       if (mem_cgroup_disabled()) {
-               *memcgp = NULL;
-               return 0;
-       }
-       /*
-        * A racing thread's fault, or swapoff, may have already
-        * updated the pte, and even removed page from swap cache: in
-        * those cases unuse_pte()'s pte_same() test will fail; but
-        * there's also a KSM case which does need to charge the page.
-        */
-       if (!PageSwapCache(page)) {
-               struct mem_cgroup *memcg;
-
-               memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
-               if (!memcg)
-                       return -ENOMEM;
-               *memcgp = memcg;
-               return 0;
-       }
-       return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
-}
-
-void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
-{
-       if (mem_cgroup_disabled())
-               return;
-       if (!memcg)
-               return;
-       __mem_cgroup_cancel_charge(memcg, 1);
-}
-
-static void
-__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
-                                       enum charge_type ctype)
-{
-       if (mem_cgroup_disabled())
-               return;
-       if (!memcg)
-               return;
-
-       __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
-       /*
-        * Now swap is on-memory. This means this page may be
-        * counted both as mem and swap....double count.
-        * Fix it by uncharging from memsw. Basically, this SwapCache is stable
-        * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
-        * may call delete_from_swap_cache() before reach here.
-        */
-       if (do_swap_account && PageSwapCache(page)) {
-               swp_entry_t ent = {.val = page_private(page)};
-               mem_cgroup_uncharge_swap(ent);
-       }
-}
-
-void mem_cgroup_commit_charge_swapin(struct page *page,
-                                    struct mem_cgroup *memcg)
-{
-       __mem_cgroup_commit_charge_swapin(page, memcg,
-                                         MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
-int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
-                               gfp_t gfp_mask)
-{
-       enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
-       struct mem_cgroup *memcg;
-       int ret;
-
-       if (mem_cgroup_disabled())
-               return 0;
-       if (PageCompound(page))
-               return 0;
-
-       if (PageSwapCache(page)) { /* shmem */
-               ret = __mem_cgroup_try_charge_swapin(mm, page,
-                                                    gfp_mask, &memcg);
-               if (ret)
-                       return ret;
-               __mem_cgroup_commit_charge_swapin(page, memcg, type);
-               return 0;
-       }
-
-       memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
-       if (!memcg)
-               return -ENOMEM;
-       __mem_cgroup_commit_charge(memcg, page, 1, type, false);
-       return 0;
-}
-
  static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
                                    unsigned int nr_pages,
                                    const enum charge_type ctype)
@@ -3993,7 +3713,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
          * replacement page, so leave it alone when phasing out the
          * page that is unused after the migration.
          */
-       if (!end_migration && !mem_cgroup_is_root(memcg))
+       if (!end_migration)
                 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
  
         return memcg;
@@ -4126,8 +3846,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
                  * We uncharge this because swap is freed.  This memcg can
                  * be obsolete one. We avoid calling css_tryget_online().
                  */
-               if (!mem_cgroup_is_root(memcg))
-                       res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+               res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
                 mem_cgroup_swap_statistics(memcg, false);
                 css_put(&memcg->css);
         }
@@ -4193,7 +3912,6 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
         struct mem_cgroup *memcg = NULL;
         unsigned int nr_pages = 1;
         struct page_cgroup *pc;
-       enum charge_type ctype;
  
         *memcgp = NULL;
  
@@ -4255,16 +3973,12 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
          * page. In the case new page is migrated but not remapped, new page's
          * mapcount will be finally 0 and we call uncharge in end_migration().
          */
-       if (PageAnon(page))
-               ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
-       else
-               ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
         /*
          * The page is committed to the memcg, but it's not actually
          * charged to the res_counter since we plan on replacing the
          * old one and only one page is going to be left afterwards.
          */
-       __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
+       commit_charge(newpage, memcg, nr_pages, PageAnon(page), false);
  }
  
  /* remove redundant charge if migration failed*/
@@ -4323,7 +4037,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
  {
         struct mem_cgroup *memcg = NULL;
         struct page_cgroup *pc;
-       enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
  
         if (mem_cgroup_disabled())
                 return;
@@ -4349,7 +4062,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
          * the newpage may be on LRU(or pagevec for LRU) already. We lock
          * LRU while we overwrite pc->mem_cgroup.
          */
-       __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
+       commit_charge(newpage, memcg, 1, false, true);
  }
  
  #ifdef CONFIG_DEBUG_VM
@@ -4817,78 +4530,24 @@ out:
         return retval;
  }
  
-
-static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
-                                              enum mem_cgroup_stat_index idx)
-{
-       struct mem_cgroup *iter;
-       long val = 0;
-
-       /* Per-cpu values can be negative, use a signed accumulator */
-       for_each_mem_cgroup_tree(iter, memcg)
-               val += mem_cgroup_read_stat(iter, idx);
-
-       if (val < 0) /* race ? */
-               val = 0;
-       return val;
-}
-
-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
-{
-       u64 val;
-
-       if (!mem_cgroup_is_root(memcg)) {
-               if (!swap)
-                       return res_counter_read_u64(&memcg->res, RES_USAGE);
-               else
-                       return res_counter_read_u64(&memcg->memsw, RES_USAGE);
-       }
-
-       /*
-        * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
-        * as well as in MEM_CGROUP_STAT_RSS_HUGE.
-        */
-       val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
-       val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
-
-       if (swap)
-               val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
-
-       return val << PAGE_SHIFT;
-}
-
  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
-                                  struct cftype *cft)
+                              struct cftype *cft)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-       u64 val;
-       int name;
-       enum res_type type;
-
-       type = MEMFILE_TYPE(cft->private);
-       name = MEMFILE_ATTR(cft->private);
+       enum res_type type = MEMFILE_TYPE(cft->private);
+       int name = MEMFILE_ATTR(cft->private);
  
         switch (type) {
         case _MEM:
-               if (name == RES_USAGE)
-                       val = mem_cgroup_usage(memcg, false);
-               else
-                       val = res_counter_read_u64(&memcg->res, name);
-               break;
+               return res_counter_read_u64(&memcg->res, name);
         case _MEMSWAP:
-               if (name == RES_USAGE)
-                       val = mem_cgroup_usage(memcg, true);
-               else
-                       val = res_counter_read_u64(&memcg->memsw, name);
-               break;
+               return res_counter_read_u64(&memcg->memsw, name);
         case _KMEM:
-               val = res_counter_read_u64(&memcg->kmem, name);
+               return res_counter_read_u64(&memcg->kmem, name);
                 break;
         default:
                 BUG();
         }
-
-       return val;
  }
  
  #ifdef CONFIG_MEMCG_KMEM
@@ -5350,7 +5009,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
         if (!t)
                 goto unlock;
  
-       usage = mem_cgroup_usage(memcg, swap);
+       if (!swap)
+               usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+       else
+               usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
  
         /*
          * current_threshold points to threshold just below or equal to usage.
@@ -5446,15 +5108,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
  
         mutex_lock(&memcg->thresholds_lock);
  
-       if (type == _MEM)
+       if (type == _MEM) {
                 thresholds = &memcg->thresholds;
-       else if (type == _MEMSWAP)
+               usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+       } else if (type == _MEMSWAP) {
                 thresholds = &memcg->memsw_thresholds;
-       else
+               usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+       } else
                 BUG();
  
-       usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
-
         /* Check if a threshold crossed before adding a new one */
         if (thresholds->primary)
                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@ -5534,18 +5196,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
         int i, j, size;
  
         mutex_lock(&memcg->thresholds_lock);
-       if (type == _MEM)
+
+       if (type == _MEM) {
                 thresholds = &memcg->thresholds;
-       else if (type == _MEMSWAP)
+               usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+       } else if (type == _MEMSWAP) {
                 thresholds = &memcg->memsw_thresholds;
-       else
+               usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+       } else
                 BUG();
  
         if (!thresholds->primary)
                 goto unlock;
  
-       usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
-
         /* Check if a threshold crossed before removing */
         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
  
@@ -6299,9 +5962,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
                  * core guarantees its existence.
                  */
         } else {
-               res_counter_init(&memcg->res, NULL);
-               res_counter_init(&memcg->memsw, NULL);
-               res_counter_init(&memcg->kmem, NULL);
+               res_counter_init(&memcg->res, &root_mem_cgroup->res);
+               res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
+               res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
                 /*
                  * Deeper hierachy with use_hierarchy == false doesn't make
                  * much sense so let cgroup subsystem know about this
@@ -6435,55 +6098,38 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
  
  #ifdef CONFIG_MMU
  /* Handlers for move charge at task migration. */
-#define PRECHARGE_COUNT_AT_ONCE        256
  static int mem_cgroup_do_precharge(unsigned long count)
  {
-       int ret = 0;
-       int batch_count = PRECHARGE_COUNT_AT_ONCE;
-       struct mem_cgroup *memcg = mc.to;
+       int ret;
  
-       if (mem_cgroup_is_root(memcg)) {
+       /* Try a single bulk charge without reclaim first */
+       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+       if (!ret) {
                 mc.precharge += count;
-               /* we don't need css_get for root */
                 return ret;
         }
-       /* try to charge at once */
-       if (count > 1) {
-               struct res_counter *dummy;
-               /*
-                * "memcg" cannot be under rmdir() because we've already checked
-                * by cgroup_lock_live_cgroup() that it is not removed and we
-                * are still under the same cgroup_mutex. So we can postpone
-                * css_get().
-                */
-               if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
-                       goto one_by_one;
-               if (do_swap_account && res_counter_charge(&memcg->memsw,
-                                               PAGE_SIZE * count, &dummy)) {
-                       res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
-                       goto one_by_one;
-               }
-               mc.precharge += count;
+       if (ret == -EINTR) {
+               cancel_charge(root_mem_cgroup, count);
                 return ret;
         }
-one_by_one:
-       /* fall back to one by one charge */
+
+       /* Try charges one by one with reclaim */
         while (count--) {
-               if (signal_pending(current)) {
-                       ret = -EINTR;
-                       break;
-               }
-               if (!batch_count--) {
-                       batch_count = PRECHARGE_COUNT_AT_ONCE;
-                       cond_resched();
-               }
-               ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
+               ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
+               /*
+                * In case of failure, any residual charges against
+                * mc.to will be dropped by mem_cgroup_clear_mc()
+                * later on.  However, cancel any charges that are
+                * bypassed to root right away or they'll be lost.
+                */
+               if (ret == -EINTR)
+                       cancel_charge(root_mem_cgroup, 1);
                 if (ret)
-                       /* mem_cgroup_clear_mc() will do uncharge later */
                         return ret;
                 mc.precharge++;
+               cond_resched();
         }
-       return ret;
+       return 0;
  }
  
  /**
@@ -6746,7 +6392,7 @@ static void __mem_cgroup_clear_mc(void)
  
         /* we must uncharge all the leftover precharges from mc.to */
         if (mc.precharge) {
-               __mem_cgroup_cancel_charge(mc.to, mc.precharge);
+               cancel_charge(mc.to, mc.precharge);
                 mc.precharge = 0;
         }
         /*
@@ -6754,27 +6400,24 @@ static void __mem_cgroup_clear_mc(void)
          * we must uncharge here.
          */
         if (mc.moved_charge) {
-               __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
+               cancel_charge(mc.from, mc.moved_charge);
                 mc.moved_charge = 0;
         }
         /* we must fixup refcnts and charges */
         if (mc.moved_swap) {
                 /* uncharge swap account from the old cgroup */
-               if (!mem_cgroup_is_root(mc.from))
-                       res_counter_uncharge(&mc.from->memsw,
-                                               PAGE_SIZE * mc.moved_swap);
+               res_counter_uncharge(&mc.from->memsw,
+                                    PAGE_SIZE * mc.moved_swap);
  
                 for (i = 0; i < mc.moved_swap; i++)
                         css_put(&mc.from->css);
  
-               if (!mem_cgroup_is_root(mc.to)) {
-                       /*
-                        * we charged both to->res and to->memsw, so we should
-                        * uncharge to->res.
-                        */
-                       res_counter_uncharge(&mc.to->res,
-                                               PAGE_SIZE * mc.moved_swap);
-               }
+               /*
+                * we charged both to->res and to->memsw, so we should
+                * uncharge to->res.
+                */
+               res_counter_uncharge(&mc.to->res,
+                                    PAGE_SIZE * mc.moved_swap);
                 /* we've already done css_get(mc.to) */
                 mc.moved_swap = 0;
         }
@@ -7086,6 +6729,150 @@ static void __init enable_swap_cgroup(void)
  }
  #endif
  
+/**
+ * mem_cgroup_try_charge - try charging a page
+ * @page: page to charge
+ * @mm: mm whose memcg is charged if none is found via the swap cache page
+ * @gfp_mask: reclaim mode
+ * @memcgp: charged memcg return, NULL if nothing had to be charged
+ *
+ * Try to charge @page to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp_mask if necessary.
+ *
+ * Returns 0 with *@memcgp set to the charged memcg, to root_mem_cgroup if
+ * try_charge() gave -EINTR, or to NULL if no charge was needed; else an error.
+ *
+ * After page->mapping has been set up, the caller must finalize the
+ * charge with mem_cgroup_commit_charge().  Or abort the transaction
+ * with mem_cgroup_cancel_charge() in case page instantiation fails.
+ */
+int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp)
+{
+       struct mem_cgroup *memcg = NULL;
+       unsigned int nr_pages = 1;
+       int ret = 0;
+
+       if (mem_cgroup_disabled())
+               goto out;
+
+       if (PageSwapCache(page)) {
+               struct page_cgroup *pc = lookup_page_cgroup(page);
+               /*
+                * Every swap fault against a single page tries to charge the
+                * page, bail as early as possible.  shmem_unuse() encounters
+                * already charged pages, too.  The USED bit is protected by
+                * the page lock, which serializes swap cache removal, which
+                * in turn serializes uncharging.
+                */
+               if (PageCgroupUsed(pc))
+                       goto out;
+       }
+
+       if (PageTransHuge(page)) {
+               nr_pages <<= compound_order(page);
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+       }
+
+       if (do_swap_account && PageSwapCache(page))
+               memcg = try_get_mem_cgroup_from_page(page);
+       if (!memcg)
+               memcg = get_mem_cgroup_from_mm(mm);
+
+       ret = try_charge(memcg, gfp_mask, nr_pages);
+
+       css_put(&memcg->css);
+
+       if (ret == -EINTR) {
+               memcg = root_mem_cgroup;
+               ret = 0;
+       }
+out:
+       *memcgp = memcg;
+       return ret;
+}
+
+/**
+ * mem_cgroup_commit_charge - commit a page charge
+ * @page: page to charge
+ * @memcg: memcg from mem_cgroup_try_charge(); nothing is done if it is NULL
+ * @lrucare: page might be on LRU already
+ *
+ * Finalize a charge transaction started by mem_cgroup_try_charge(),
+ * after page->mapping has been set up.  This must happen atomically
+ * as part of the page instantiation, i.e. under the page table lock
+ * for anonymous pages, under the page lock for page and swap cache.
+ *
+ * In addition, the page must not be on the LRU during the commit, to
+ * prevent racing with task migration.  If it might be, use @lrucare.
+ *
+ * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
+ */
+void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
+                             bool lrucare)
+{
+       unsigned int nr_pages = 1;
+
+       VM_BUG_ON_PAGE(!page->mapping, page);
+       VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
+
+       if (mem_cgroup_disabled())
+               return;
+       /*
+        * Swap faults will attempt to charge the same page multiple
+        * times.  But reuse_swap_page() might have removed the page from
+        * swapcache already, so test for a NULL memcg, not PageSwapCache().
+        */
+       if (!memcg)
+               return;
+
+       if (PageTransHuge(page)) {
+               nr_pages <<= compound_order(page);
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+       }
+
+       commit_charge(page, memcg, nr_pages, PageAnon(page), lrucare);
+
+       if (do_swap_account && PageSwapCache(page)) {
+               swp_entry_t entry = { .val = page_private(page) };
+               /*
+                * The swap entry might not get freed for a long time,
+                * let's not wait for it.  The page already received a
+                * memory+swap charge, drop the swap entry duplicate.
+                */
+               mem_cgroup_uncharge_swap(entry);
+       }
+}
+
+/**
+ * mem_cgroup_cancel_charge - cancel a page charge
+ * @page: page whose charge is being cancelled
+ * @memcg: memcg from mem_cgroup_try_charge(); nothing is done if it is NULL
+ *
+ * Cancel a charge transaction started by mem_cgroup_try_charge().
+ */
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+{
+       unsigned int nr_pages = 1;
+
+       if (mem_cgroup_disabled())
+               return;
+       /*
+        * Swap faults will attempt to charge the same page multiple
+        * times.  But reuse_swap_page() might have removed the page from
+        * swapcache already, so test for a NULL memcg, not PageSwapCache().
+        */
+       if (!memcg)
+               return;
+
+       if (PageTransHuge(page)) {
+               nr_pages <<= compound_order(page);
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+       }
+
+       cancel_charge(memcg, nr_pages);
+}
+
  /*
   * subsys_initcall() for memory controller.
   *