b43: move under broadcom vendor directory
[cascardo/linux.git] / mm / memcontrol.c
index c57c442..9acfb16 100644 (file)
@@ -62,6 +62,7 @@
 #include <linux/oom.h>
 #include <linux/lockdep.h>
 #include <linux/file.h>
+#include <linux/tracehook.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -434,7 +435,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 
        memcg = page->mem_cgroup;
 
-       if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+       if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
                memcg = root_mem_cgroup;
 
        rcu_read_unlock();
@@ -1661,7 +1662,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-       if (!current->memcg_oom.may_oom)
+       if (!current->memcg_may_oom)
                return;
        /*
         * We are in the middle of the charge context here, so we
@@ -1678,9 +1679,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
         * and when we know whether the fault was overall successful.
         */
        css_get(&memcg->css);
-       current->memcg_oom.memcg = memcg;
-       current->memcg_oom.gfp_mask = mask;
-       current->memcg_oom.order = order;
+       current->memcg_in_oom = memcg;
+       current->memcg_oom_gfp_mask = mask;
+       current->memcg_oom_order = order;
 }
 
 /**
@@ -1702,7 +1703,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
  */
 bool mem_cgroup_oom_synchronize(bool handle)
 {
-       struct mem_cgroup *memcg = current->memcg_oom.memcg;
+       struct mem_cgroup *memcg = current->memcg_in_oom;
        struct oom_wait_info owait;
        bool locked;
 
@@ -1730,8 +1731,8 @@ bool mem_cgroup_oom_synchronize(bool handle)
        if (locked && !memcg->oom_kill_disable) {
                mem_cgroup_unmark_under_oom(memcg);
                finish_wait(&memcg_oom_waitq, &owait.wait);
-               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-                                        current->memcg_oom.order);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+                                        current->memcg_oom_order);
        } else {
                schedule();
                mem_cgroup_unmark_under_oom(memcg);
@@ -1748,7 +1749,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
                memcg_oom_recover(memcg);
        }
 cleanup:
-       current->memcg_oom.memcg = NULL;
+       current->memcg_in_oom = NULL;
        css_put(&memcg->css);
        return true;
 }
@@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
        return NOTIFY_OK;
 }
 
+/*
+ * Scheduled by try_charge() via set_notify_resume(); runs on the return to
+ * userland; reclaims from current's memcg and each parent over its high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+       unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       struct mem_cgroup *memcg, *pos;
+
+       if (likely(!nr_pages))  /* nothing was accumulated by try_charge() */
+               return;
+
+       pos = memcg = get_mem_cgroup_from_mm(current->mm); /* css_put() below */
+
+       do {    /* current->mm's memcg first, then each parent in turn */
+               if (page_counter_read(&pos->memory) <= pos->high)
+                       continue;       /* not over high: move on to the parent */
+               mem_cgroup_events(pos, MEMCG_HIGH, 1);
+               try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+       } while ((pos = parent_mem_cgroup(pos)));
+
+       css_put(&memcg->css);
+       current->memcg_nr_pages_over_high = 0;  /* scheduled amount handled */
+}
+
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
 {
@@ -1982,17 +2008,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
        unsigned long nr_reclaimed;
        bool may_swap = true;
        bool drained = false;
-       int ret = 0;
 
        if (mem_cgroup_is_root(memcg))
-               goto done;
+               return 0;
 retry:
        if (consume_stock(memcg, nr_pages))
-               goto done;
+               return 0;
 
        if (!do_swap_account ||
-           !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
-               if (!page_counter_try_charge(&memcg->memory, batch, &counter))
+           page_counter_try_charge(&memcg->memsw, batch, &counter)) {
+               if (page_counter_try_charge(&memcg->memory, batch, &counter))
                        goto done_restock;
                if (do_swap_account)
                        page_counter_uncharge(&memcg->memsw, batch);
@@ -2016,12 +2041,12 @@ retry:
        if (unlikely(test_thread_flag(TIF_MEMDIE) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
-               goto bypass;
+               goto force;
 
        if (unlikely(task_in_memcg_oom(current)))
                goto nomem;
 
-       if (!(gfp_mask & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(gfp_mask))
                goto nomem;
 
        mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
@@ -2062,38 +2087,54 @@ retry:
                goto retry;
 
        if (gfp_mask & __GFP_NOFAIL)
-               goto bypass;
+               goto force;
 
        if (fatal_signal_pending(current))
-               goto bypass;
+               goto force;
 
        mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
 
-       mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
+       mem_cgroup_oom(mem_over_limit, gfp_mask,
+                      get_order(nr_pages * PAGE_SIZE));
 nomem:
        if (!(gfp_mask & __GFP_NOFAIL))
                return -ENOMEM;
-bypass:
-       return -EINTR;
+force:
+       /*
+        * The allocation either can't fail or will lead to more memory
+        * being freed very soon.  Allow memory usage to go over the limit
+        * temporarily by force charging it.
+        */
+       page_counter_charge(&memcg->memory, nr_pages);
+       if (do_swap_account)
+               page_counter_charge(&memcg->memsw, nr_pages);
+       css_get_many(&memcg->css, nr_pages);
+
+       return 0;
 
 done_restock:
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
-       if (!(gfp_mask & __GFP_WAIT))
-               goto done;
+
        /*
-        * If the hierarchy is above the normal consumption range,
-        * make the charging task trim their excess contribution.
+        * If the hierarchy is above the normal consumption range, schedule
+        * reclaim on returning to userland.  We could perform reclaim here
+        * if __GFP_RECLAIM is set, but always punt for simplicity and so
+        * that GFP_KERNEL can consistently be used during reclaim.  @memcg
+        * is not recorded as it most likely matches current's and won't
+        * change in the meantime.  As the high limit is checked again
+        * before reclaim, the cost of a mismatch is negligible.
         */
        do {
-               if (page_counter_read(&memcg->memory) <= memcg->high)
-                       continue;
-               mem_cgroup_events(memcg, MEMCG_HIGH, 1);
-               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+               if (page_counter_read(&memcg->memory) > memcg->high) {
+                       current->memcg_nr_pages_over_high += nr_pages;
+                       set_notify_resume(current);
+                       break;
+               }
        } while ((memcg = parent_mem_cgroup(memcg)));
-done:
-       return ret;
+
+       return 0;
 }
 
 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2174,55 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-                     unsigned long nr_pages)
-{
-       struct page_counter *counter;
-       int ret = 0;
-
-       ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
-       if (ret < 0)
-               return ret;
-
-       ret = try_charge(memcg, gfp, nr_pages);
-       if (ret == -EINTR)  {
-               /*
-                * try_charge() chose to bypass to root due to OOM kill or
-                * fatal signal.  Since our only options are to either fail
-                * the allocation or charge it to this cgroup, do it as a
-                * temporary condition. But we can't fail. From a kmem/slab
-                * perspective, the cache has already been selected, by
-                * mem_cgroup_kmem_get_cache(), so it is too late to change
-                * our minds.
-                *
-                * This condition will only trigger if the task entered
-                * memcg_charge_kmem in a sane state, but was OOM-killed
-                * during try_charge() above. Tasks that were already dying
-                * when the allocation triggers should have been already
-                * directed to the root cgroup in memcontrol.h
-                */
-               page_counter_charge(&memcg->memory, nr_pages);
-               if (do_swap_account)
-                       page_counter_charge(&memcg->memsw, nr_pages);
-               css_get_many(&memcg->css, nr_pages);
-               ret = 0;
-       } else if (ret)
-               page_counter_uncharge(&memcg->kmem, nr_pages);
-
-       return ret;
-}
-
-void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
-{
-       page_counter_uncharge(&memcg->memory, nr_pages);
-       if (do_swap_account)
-               page_counter_uncharge(&memcg->memsw, nr_pages);
-
-       page_counter_uncharge(&memcg->kmem, nr_pages);
-
-       css_put_many(&memcg->css, nr_pages);
-}
-
 static int memcg_alloc_cache_id(void)
 {
        int id, size;
@@ -2384,85 +2376,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep)
                css_put(&cachep->memcg_params.memcg->css);
 }
 
-/*
- * We need to verify if the allocation against current->mm->owner's memcg is
- * possible for the given order. But the page is not allocated yet, so we'll
- * need a further commit step to do the final arrangements.
- *
- * It is possible for the task to switch cgroups in this mean time, so at
- * commit time, we can't rely on task conversion any longer.  We'll then use
- * the handle argument to return to the caller which cgroup we should commit
- * against. We could also return the memcg directly and avoid the pointer
- * passing, but a boolean return value gives better semantics considering
- * the compiled-out case as well.
- *
- * Returning true means the allocation is possible.
- */
-bool
-__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+                             struct mem_cgroup *memcg)
 {
-       struct mem_cgroup *memcg;
+       unsigned int nr_pages = 1 << order;     /* 2^order pages */
+       struct page_counter *counter;
        int ret;
 
-       *_memcg = NULL;
+       if (!memcg_kmem_is_active(memcg))
+               return 0;       /* succeed without charging anything */
 
-       memcg = get_mem_cgroup_from_mm(current->mm);
+       if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
+               return -ENOMEM; /* the kmem counter refused the charge */
 
-       if (!memcg_kmem_is_active(memcg)) {
-               css_put(&memcg->css);
-               return true;
+       ret = try_charge(memcg, gfp, nr_pages); /* memory (and memsw) charge */
+       if (ret) {      /* roll back the kmem charge taken above */
+               page_counter_uncharge(&memcg->kmem, nr_pages);
+               return ret;
        }
 
-       ret = memcg_charge_kmem(memcg, gfp, 1 << order);
-       if (!ret)
-               *_memcg = memcg;
+       page->mem_cgroup = memcg;       /* read by __memcg_kmem_uncharge() */
 
-       css_put(&memcg->css);
-       return (ret == 0);
+       return 0;
 }
 
-void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 {
-       VM_BUG_ON(mem_cgroup_is_root(memcg));
+       struct mem_cgroup *memcg;
+       int ret;
 
-       /* The page allocation failed. Revert */
-       if (!page) {
-               memcg_uncharge_kmem(memcg, 1 << order);
-               return;
-       }
-       page->mem_cgroup = memcg;
+       memcg = get_mem_cgroup_from_mm(current->mm);    /* current's memcg */
+       ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+       css_put(&memcg->css);   /* pairs with get_mem_cgroup_from_mm() */
+       return ret;
 }
 
-void __memcg_kmem_uncharge_pages(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
 {
        struct mem_cgroup *memcg = page->mem_cgroup;
+       unsigned int nr_pages = 1 << order;     /* 2^order pages */
 
        if (!memcg)
                return;
 
        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
 
-       memcg_uncharge_kmem(memcg, 1 << order);
-       page->mem_cgroup = NULL;
-}
-
-struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
-{
-       struct mem_cgroup *memcg = NULL;
-       struct kmem_cache *cachep;
-       struct page *page;
-
-       page = virt_to_head_page(ptr);
-       if (PageSlab(page)) {
-               cachep = page->slab_cache;
-               if (!is_root_cache(cachep))
-                       memcg = cachep->memcg_params.memcg;
-       } else
-               /* page allocated by alloc_kmem_pages */
-               memcg = page->mem_cgroup;
+       page_counter_uncharge(&memcg->kmem, nr_pages);  /* undo kmem charge */
+       page_counter_uncharge(&memcg->memory, nr_pages);
+       if (do_swap_account)    /* memsw is only uncharged when this is set */
+               page_counter_uncharge(&memcg->memsw, nr_pages);
 
-       return memcg;
+       page->mem_cgroup = NULL;        /* page no longer points at a memcg */
+       css_put_many(&memcg->css, nr_pages);    /* one css ref per page */
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
@@ -2836,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
        return val;
 }
 
-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
-       u64 val;
+       unsigned long val;
 
        if (mem_cgroup_is_root(memcg)) {
                val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
@@ -2851,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
                else
                        val = page_counter_read(&memcg->memsw);
        }
-       return val << PAGE_SHIFT;
+       return val;
 }
 
 enum {
@@ -2885,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
        switch (MEMFILE_ATTR(cft->private)) {
        case RES_USAGE:
                if (counter == &memcg->memory)
-                       return mem_cgroup_usage(memcg, false);
+                       return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
                if (counter == &memcg->memsw)
-                       return mem_cgroup_usage(memcg, true);
+                       return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
                return (u64)page_counter_read(counter) * PAGE_SIZE;
        case RES_LIMIT:
                return (u64)counter->limit * PAGE_SIZE;
@@ -2926,7 +2891,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
         * of course permitted.
         */
        mutex_lock(&memcg_create_mutex);
-       if (cgroup_has_tasks(memcg->css.cgroup) ||
+       if (cgroup_is_populated(memcg->css.cgroup) ||
            (memcg->use_hierarchy && memcg_has_children(memcg)))
                err = -EBUSY;
        mutex_unlock(&memcg_create_mutex);
@@ -3387,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
        ret = page_counter_memparse(args, "-1", &threshold);
        if (ret)
                return ret;
-       threshold <<= PAGE_SHIFT;
 
        mutex_lock(&memcg->thresholds_lock);
 
@@ -4066,8 +4030,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
        {
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
-               .flags = CFTYPE_NO_PREFIX,
-               .mode = S_IWUGO,
+               .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
        },
        {
                .name = "swappiness",
@@ -4401,28 +4364,16 @@ static int mem_cgroup_do_precharge(unsigned long count)
 {
        int ret;
 
-       /* Try a single bulk charge without reclaim first */
-       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+       /* Try a single bulk charge without reclaim first, kswapd may wake */
+       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
        if (!ret) {
                mc.precharge += count;
                return ret;
        }
-       if (ret == -EINTR) {
-               cancel_charge(root_mem_cgroup, count);
-               return ret;
-       }
 
        /* Try charges one by one with reclaim */
        while (count--) {
                ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
-               /*
-                * In case of failure, any residual charges against
-                * mc.to will be dropped by mem_cgroup_clear_mc()
-                * later on.  However, cancel any charges that are
-                * bypassed to root right away or they'll be lost.
-                */
-               if (ret == -EINTR)
-                       cancel_charge(root_mem_cgroup, 1);
                if (ret)
                        return ret;
                mc.precharge++;
@@ -4577,9 +4528,8 @@ static int mem_cgroup_move_account(struct page *page,
                goto out;
 
        /*
-        * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
-        * of its source page while we change it: page migration takes
-        * both pages off the LRU, but page cache replacement doesn't.
+        * Prevent mem_cgroup_replace_page() from looking at
+        * page->mem_cgroup of its source page while we change it.
         */
        if (!trylock_page(page))
                goto out;
@@ -4834,7 +4784,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *from;
-       struct task_struct *p;
+       struct task_struct *leader, *p;
        struct mm_struct *mm;
        unsigned long move_flags;
        int ret = 0;
@@ -4848,7 +4798,20 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
        if (!move_flags)
                return 0;
 
-       p = cgroup_taskset_first(tset);
+       /*
+        * Multi-process migrations only happen on the default hierarchy
+        * where charge immigration is not used.  Perform charge
+        * immigration if @tset contains a leader and whine if there are
+        * multiple.
+        */
+       p = NULL;
+       cgroup_taskset_for_each_leader(leader, tset) {
+               WARN_ON_ONCE(p);
+               p = leader;
+       }
+       if (!p)
+               return 0;
+
        from = mem_cgroup_from_task(p);
 
        VM_BUG_ON(from == memcg);
@@ -5064,7 +5027,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
-       if (cgroup_on_dfl(root_css->cgroup))
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                root_mem_cgroup->use_hierarchy = true;
        else
                root_mem_cgroup->use_hierarchy = false;
@@ -5073,7 +5036,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 static u64 memory_current_read(struct cgroup_subsys_state *css,
                                struct cftype *cft)
 {
-       return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+       return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; /* bytes */
 }
 
 static int memory_low_show(struct seq_file *m, void *v)
@@ -5185,6 +5150,7 @@ static int memory_events_show(struct seq_file *m, void *v)
 static struct cftype memory_files[] = {
        {
                .name = "current",
+               .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = memory_current_read,
        },
        {
@@ -5208,6 +5174,7 @@ static struct cftype memory_files[] = {
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
+               .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        { }     /* terminate */
@@ -5327,11 +5294,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
        ret = try_charge(memcg, gfp_mask, nr_pages);
 
        css_put(&memcg->css);
-
-       if (ret == -EINTR) {
-               memcg = root_mem_cgroup;
-               ret = 0;
-       }
 out:
        *memcgp = memcg;
        return ret;
@@ -5546,7 +5508,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
 }
 
 /**
- * mem_cgroup_migrate - migrate a charge to another page
+ * mem_cgroup_replace_page - migrate a charge to another page
  * @oldpage: currently charged page
  * @newpage: page to transfer the charge to
  * @lrucare: either or both pages might be on the LRU already
@@ -5555,16 +5517,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
  *
  * Both pages must be locked, @newpage->mapping must be set up.
  */
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
-                       bool lrucare)
+void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
 {
        struct mem_cgroup *memcg;
        int isolated;
 
        VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
        VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
        VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
                       newpage);
@@ -5576,25 +5535,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
        if (newpage->mem_cgroup)
                return;
 
-       /*
-        * Swapcache readahead pages can get migrated before being
-        * charged, and migration from compaction can happen to an
-        * uncharged page when the PFN walker finds a page that
-        * reclaim just put back on the LRU but has not released yet.
-        */
+       /* Swapcache readahead pages can get replaced before being charged */
        memcg = oldpage->mem_cgroup;
        if (!memcg)
                return;
 
-       if (lrucare)
-               lock_page_lru(oldpage, &isolated);
-
+       lock_page_lru(oldpage, &isolated);
        oldpage->mem_cgroup = NULL;
+       unlock_page_lru(oldpage, isolated);
 
-       if (lrucare)
-               unlock_page_lru(oldpage, isolated);
-
-       commit_charge(newpage, memcg, lrucare);
+       commit_charge(newpage, memcg, true);
 }
 
 /*