[cascardo/linux.git] mm/memcontrol.c
index a04225d..acb93c5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -77,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 #define MEM_CGROUP_RECLAIM_RETRIES     5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
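+/* css of root_mem_cgroup, set once when the root cgroup is allocated */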
+struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
@@ -90,6 +91,7 @@ static const char * const mem_cgroup_stat_names[] = {
        "rss",
        "rss_huge",
        "mapped_file",
+       "dirty",
        "writeback",
        "swap",
 };
@@ -285,9 +287,9 @@ struct mem_cgroup {
         */
        bool use_hierarchy;
 
+       /* protected by memcg_oom_lock */
        bool            oom_lock;
-       atomic_t        under_oom;
-       atomic_t        oom_wakeups;
+       int             under_oom;
 
        int     swappiness;
        /* OOM-Killer disable */
@@ -322,11 +324,6 @@ struct mem_cgroup {
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
-       /*
-        * used when a cpu is offlined or other synchronizations
-        * See mem_cgroup_read_stat().
-        */
-       struct mem_cgroup_stat_cpu nocpu_base;
        spinlock_t pcp_counter_lock;
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
@@ -346,6 +343,11 @@ struct mem_cgroup {
        atomic_t        numainfo_updating;
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
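+       /* writeback structs attached to this memcg and its dirty throttling domain */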
+       struct list_head cgwb_list;
+       struct wb_domain cgwb_domain;
+#endif
+
        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;
@@ -596,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
        return &memcg->css;
 }
 
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned.  The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ *
+ * XXX: The above description of behavior on the default hierarchy isn't
+ * strictly true yet as replace_page_cache_page() can modify the
+ * association before @page is released even on the default hierarchy;
+ * however, the current and planned usages don't mix the two functions
+ * and replace_page_cache_page() will soon be updated to make the invariant
+ * actually true.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+{
+       struct mem_cgroup *memcg;
+
+       rcu_read_lock();
+
+       memcg = page->mem_cgroup;
+
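+       /* uncharged page or legacy hierarchy: fall back to the root css */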
+       if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+               memcg = root_mem_cgroup;
+
+       rcu_read_unlock();
+       return &memcg->css;
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
@@ -795,15 +830,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
        long val = 0;
        int cpu;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
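+       /* percpu data of offline CPUs is preserved, so summing all possible CPUs is enough */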
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.count[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
        return val;
 }
 
@@ -813,15 +841,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        unsigned long val = 0;
        int cpu;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.events[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
        return val;
 }
 
@@ -1530,14 +1551,16 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
        unsigned int points = 0;
        struct task_struct *chosen = NULL;
 
+       mutex_lock(&oom_lock);
+
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
        if (fatal_signal_pending(current) || task_will_free_mem(current)) {
-               mark_tsk_oom_victim(current);
-               return;
+               mark_oom_victim(current);
+               goto unlock;
        }
 
        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
@@ -1564,7 +1587,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                mem_cgroup_iter_break(memcg, iter);
                                if (chosen)
                                        put_task_struct(chosen);
-                               return;
+                               goto unlock;
                        case OOM_SCAN_OK:
                                break;
                        };
@@ -1585,11 +1608,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                css_task_iter_end(&it);
        }
 
-       if (!chosen)
-               return;
-       points = chosen_points * 1000 / totalpages;
-       oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
-                        NULL, "Memory cgroup out of memory");
+       if (chosen) {
+               points = chosen_points * 1000 / totalpages;
+               oom_kill_process(chosen, gfp_mask, order, points, totalpages,
+                                memcg, NULL, "Memory cgroup out of memory");
+       }
+unlock:
+       mutex_unlock(&oom_lock);
 }
 
 #if MAX_NUMNODES > 1
@@ -1806,8 +1831,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *iter;
 
+       spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
-               atomic_inc(&iter->under_oom);
+               iter->under_oom++;
+       spin_unlock(&memcg_oom_lock);
 }
 
 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
@@ -1816,11 +1843,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 
        /*
         * When a new child is created while the hierarchy is under oom,
-        * mem_cgroup_oom_lock() may not be called. We have to use
-        * atomic_add_unless() here.
+        * mem_cgroup_oom_lock() may not be called. Watch for underflow.
         */
+       spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
-               atomic_add_unless(&iter->under_oom, -1, 0);
+               if (iter->under_oom > 0)
+                       iter->under_oom--;
+       spin_unlock(&memcg_oom_lock);
 }
 
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1846,17 +1875,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
        return autoremove_wake_function(wait, mode, sync, arg);
 }
 
-static void memcg_wakeup_oom(struct mem_cgroup *memcg)
-{
-       atomic_inc(&memcg->oom_wakeups);
-       /* for filtering, pass "memcg" as argument. */
-       __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
-}
-
 static void memcg_oom_recover(struct mem_cgroup *memcg)
 {
-       if (memcg && atomic_read(&memcg->under_oom))
-               memcg_wakeup_oom(memcg);
+       /*
+        * For the following lockless ->under_oom test, the only required
+        * guarantee is that it must see the state asserted by an OOM when
+        * this function is called as a result of userland actions
+        * triggered by the notification of the OOM.  This is trivially
+        * achieved by invoking mem_cgroup_mark_under_oom() before
+        * triggering notification.
+        */
+       if (memcg && memcg->under_oom)
+               __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
 
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
@@ -2011,6 +2041,7 @@ again:
 
        return memcg;
 }
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
 
 /**
  * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2029,6 +2060,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
        rcu_read_unlock();
 }
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
 /**
  * mem_cgroup_update_page_stat - update page state statistics
@@ -2169,37 +2201,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
        mutex_unlock(&percpu_charge_mutex);
 }
 
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
-       int i;
-
-       spin_lock(&memcg->pcp_counter_lock);
-       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-               long x = per_cpu(memcg->stat->count[i], cpu);
-
-               per_cpu(memcg->stat->count[i], cpu) = 0;
-               memcg->nocpu_base.count[i] += x;
-       }
-       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-               unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
-               per_cpu(memcg->stat->events[i], cpu) = 0;
-               memcg->nocpu_base.events[i] += x;
-       }
-       spin_unlock(&memcg->pcp_counter_lock);
-}
-
 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
                                        unsigned long action,
                                        void *hcpu)
 {
        int cpu = (unsigned long)hcpu;
        struct memcg_stock_pcp *stock;
-       struct mem_cgroup *iter;
 
        if (action == CPU_ONLINE)
                return NOTIFY_OK;
@@ -2207,9 +2214,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
 
-       for_each_mem_cgroup(iter)
-               mem_cgroup_drain_pcp_counter(iter, cpu);
-
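+       /* per-cpu stats stay in place; only the cached charge stock needs draining */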
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
        return NOTIFY_OK;
@@ -3864,7 +3868,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
        list_add(&event->list, &memcg->oom_notify);
 
        /* already in OOM ? */
-       if (atomic_read(&memcg->under_oom))
+       if (memcg->under_oom)
                eventfd_signal(eventfd, 1);
        spin_unlock(&memcg_oom_lock);
 
@@ -3893,7 +3897,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
 
        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
-       seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
+       seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
        return 0;
 }
 
@@ -3995,6 +3999,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 }
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+{
+       return &memcg->cgwb_list;
+}
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+       return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+       wb_domain_exit(&memcg->cgwb_domain);
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+       wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
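+       /* the root memcg doesn't get a per-memcg dirty throttling domain */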
+       if (!memcg->css.parent)
+               return NULL;
+
+       return &memcg->cgwb_domain;
+}
+
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pavail: out parameter for number of available pages
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of available, dirty, and writeback pages in @wb's
+ * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
+ * more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used".  The available memory is
+ * calculated as the lowest headroom of the memcg and its ancestors, plus the
+ * number of pages already being used for file pages.  Note that this
+ * doesn't consider the actual amount of available memory in the system.
+ * The caller should further cap *@pavail accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+                        unsigned long *pdirty, unsigned long *pwriteback)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       struct mem_cgroup *parent;
+       unsigned long head_room = PAGE_COUNTER_MAX;
+       unsigned long file_pages;
+
+       *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+       /* this should eventually include NR_UNSTABLE_NFS */
+       *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+
+       file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+                                                   (1 << LRU_ACTIVE_FILE));
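+       /* take the lowest headroom, min(max, high) - used, along the hierarchy */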
+       while ((parent = parent_mem_cgroup(memcg))) {
+               unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+               unsigned long used = page_counter_read(&memcg->memory);
+
+               head_room = min(head_room, ceiling - min(ceiling, used));
+               memcg = parent;
+       }
+
+       *pavail = file_pages + head_room;
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+       return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * DO NOT USE IN NEW FILES.
  *
@@ -4379,9 +4475,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
        if (!memcg->stat)
                goto out_free;
+
+       if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+               goto out_free_stat;
+
        spin_lock_init(&memcg->pcp_counter_lock);
        return memcg;
 
+out_free_stat:
+       free_percpu(memcg->stat);
 out_free:
        kfree(memcg);
        return NULL;
@@ -4408,6 +4510,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
                free_mem_cgroup_per_zone_info(memcg, node);
 
        free_percpu(memcg->stat);
+       memcg_wb_domain_exit(memcg);
        kfree(memcg);
 }
 
@@ -4440,6 +4543,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        /* root ? */
        if (parent_css == NULL) {
                root_mem_cgroup = memcg;
+               mem_cgroup_root_css = &memcg->css;
                page_counter_init(&memcg->memory, NULL);
                memcg->high = PAGE_COUNTER_MAX;
                memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4458,7 +4562,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #ifdef CONFIG_MEMCG_KMEM
        memcg->kmemcg_id = -1;
 #endif
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
        return &memcg->css;
 
 free_out:
@@ -4546,6 +4652,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        vmpressure_cleanup(&memcg->vmpressure);
 
        memcg_deactivate_kmem(memcg);
+
+       wb_memcg_offline(memcg);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4579,6 +4687,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        memcg->low = 0;
        memcg->high = PAGE_COUNTER_MAX;
        memcg->soft_limit = PAGE_COUNTER_MAX;
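+       /* limits were reset above, so the writeback domain size may have changed */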
+       memcg_wb_domain_size_changed(memcg);
 }
 
 #ifdef CONFIG_MMU
@@ -4748,6 +4857,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
        unsigned long flags;
        int ret;
+       bool anon;
 
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4773,15 +4883,33 @@ static int mem_cgroup_move_account(struct page *page,
        if (page->mem_cgroup != from)
                goto out_unlock;
 
+       anon = PageAnon(page);
+
        spin_lock_irqsave(&from->move_lock, flags);
 
-       if (!PageAnon(page) && page_mapped(page)) {
+       if (!anon && page_mapped(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
        }
 
+       /*
+        * move_lock is held and the caller has set from->moving_account, so
+        * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+        * The mapping is therefore stable for dirty pages.
+        */
+       if (!anon && PageDirty(page)) {
+               struct address_space *mapping = page_mapping(page);
+
+               if (mapping_cap_account_dirty(mapping)) {
+                       __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+                       __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+               }
+       }
+
        if (PageWriteback(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
                               nr_pages);
@@ -5297,6 +5425,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
        memcg->high = high;
 
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }
 
@@ -5329,6 +5458,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }