memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything
[cascardo/linux.git] / mm / memcontrol.c
index c016e00..848fc6c 100644 (file)
@@ -138,6 +138,7 @@ static const char * const mem_cgroup_lru_names[] = {
  */
 enum mem_cgroup_events_target {
        MEM_CGROUP_TARGET_THRESH,
+       MEM_CGROUP_TARGET_SOFTLIMIT,
        MEM_CGROUP_TARGET_NUMAINFO,
        MEM_CGROUP_NTARGETS,
 };
@@ -315,6 +316,22 @@ struct mem_cgroup {
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;
 #endif
+       /*
+        * Protects soft_contributed transitions.
+        * See mem_cgroup_update_soft_limit
+        */
+       spinlock_t soft_lock;
+
+       /*
+        * If true then this group has increased parents' children_in_excess
+        * when it got over the soft limit.
+        * When a group falls below the soft limit, parents' children_in_excess
+        * is decreased and soft_contributed is changed to false.
+        */
+       bool soft_contributed;
+
+       /* Number of descendants that are in soft limit excess */
+       atomic_t children_in_excess;
 
        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */
@@ -802,6 +819,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
                case MEM_CGROUP_TARGET_THRESH:
                        next = val + THRESHOLDS_EVENTS_TARGET;
                        break;
+               case MEM_CGROUP_TARGET_SOFTLIMIT:
+                       next = val + SOFTLIMIT_EVENTS_TARGET;
+                       break;
                case MEM_CGROUP_TARGET_NUMAINFO:
                        next = val + NUMAINFO_EVENTS_TARGET;
                        break;
@@ -814,6 +834,42 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
        return false;
 }
 
+/*
+ * Called from the rate-limited memcg_check_events when enough
+ * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated. It makes sure
+ * that all the parents up the hierarchy are notified when this group
+ * gets into excess or when it is no longer in excess. memcg->soft_contributed
+ * makes the transition a single action whenever the state flips from one
+ * to the other.
+ */
+static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
+{
+       unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
+       struct mem_cgroup *parent = memcg;
+       int delta = 0;
+
+       spin_lock(&memcg->soft_lock);
+       if (excess) {
+               if (!memcg->soft_contributed) {
+                       delta = 1;
+                       memcg->soft_contributed = true;
+               }
+       } else {
+               if (memcg->soft_contributed) {
+                       delta = -1;
+                       memcg->soft_contributed = false;
+               }
+       }
+
+       /*
+        * Necessary to update all ancestors when hierarchy is used
+        * because their event counter is not touched.
+        */
+       while (delta && (parent = parent_mem_cgroup(parent)))
+               atomic_add(delta, &parent->children_in_excess);
+       spin_unlock(&memcg->soft_lock);
+}
+
 /*
  * Check events in order.
  *
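
The update above is deliberately idempotent: the +1/-1 is propagated to the ancestors only when soft_contributed actually flips, so no matter how often the event fires, every ancestor's children_in_excess moves by exactly one per state change. A minimal userspace sketch of that invariant (toy types only, not kernel code; struct group and update_soft_limit are illustrative names):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for struct mem_cgroup; only the fields used here. */
struct group {
	struct group *parent;
	bool soft_contributed;
	int children_in_excess;
};

/* Mirrors mem_cgroup_update_soft_limit: act only on a state flip. */
static void update_soft_limit(struct group *g, bool in_excess)
{
	int delta = 0;

	if (in_excess && !g->soft_contributed) {
		delta = 1;
		g->soft_contributed = true;
	} else if (!in_excess && g->soft_contributed) {
		delta = -1;
		g->soft_contributed = false;
	}

	/* Every ancestor sees exactly one increment or decrement. */
	for (struct group *p = g->parent; delta && p; p = p->parent)
		p->children_in_excess += delta;
}

int main(void)
{
	struct group root = { 0 }, mid = { &root }, leaf = { &mid };

	update_soft_limit(&leaf, true);   /* leaf goes over its soft limit */
	update_soft_limit(&leaf, true);   /* repeated event: no double count */
	printf("root=%d mid=%d\n", root.children_in_excess,
	       mid.children_in_excess);   /* prints: root=1 mid=1 */

	update_soft_limit(&leaf, false);  /* leaf drops below the limit */
	printf("root=%d mid=%d\n", root.children_in_excess,
	       mid.children_in_excess);   /* prints: root=0 mid=0 */
	return 0;
}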
@@ -824,8 +880,11 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
        /* threshold event is triggered in finer grain than soft limit */
        if (unlikely(mem_cgroup_event_ratelimit(memcg,
                                                MEM_CGROUP_TARGET_THRESH))) {
+               bool do_softlimit;
                bool do_numainfo __maybe_unused;
 
+               do_softlimit = mem_cgroup_event_ratelimit(memcg,
+                                               MEM_CGROUP_TARGET_SOFTLIMIT);
 #if MAX_NUMNODES > 1
                do_numainfo = mem_cgroup_event_ratelimit(memcg,
                                                MEM_CGROUP_TARGET_NUMAINFO);
@@ -833,6 +892,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
                preempt_enable();
 
                mem_cgroup_threshold(memcg);
+               if (unlikely(do_softlimit))
+                       mem_cgroup_update_soft_limit(memcg);
 #if MAX_NUMNODES > 1
                if (unlikely(do_numainfo))
                        atomic_inc(&memcg->numainfo_events);
@@ -875,6 +936,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
        return memcg;
 }
 
+static enum mem_cgroup_filter_t
+mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
+               mem_cgroup_iter_filter cond)
+{
+       if (!cond)
+               return VISIT;
+       return cond(memcg, root);
+}
+
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -882,7 +952,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-               struct mem_cgroup *last_visited)
+               struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
 {
        struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -900,11 +970,31 @@ skip_node:
        if (next_css) {
                struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-               if (css_tryget(&mem->css))
-                       return mem;
-               else {
+               switch (mem_cgroup_filter(mem, root, cond)) {
+               case SKIP:
                        prev_css = next_css;
                        goto skip_node;
+               case SKIP_TREE:
+                       if (mem == root)
+                               return NULL;
+                       /*
+                        * css_rightmost_descendant is not an optimal way to
+                        * skip through a subtree (especially for imbalanced
+                        * trees leaning to the right) but that's what we have
+                        * right now. A more effective solution would be traversing
+                        * right-up for the first non-NULL without calling
+                        * css_next_descendant_pre afterwards.
+                        */
+                       prev_css = css_rightmost_descendant(next_css);
+                       goto skip_node;
+               case VISIT:
+                       if (css_tryget(&mem->css))
+                               return mem;
+                       else {
+                               prev_css = next_css;
+                               goto skip_node;
+                       }
+                       break;
                }
        }
 
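
In the switch above the three filter results map onto the pre-order walk as follows: VISIT yields the node (subject to css_tryget), SKIP moves on to the next node, and SKIP_TREE prunes the node's entire subtree. A rough userspace model of that pruning, using a toy tree instead of the css iterators (struct node, filter and walk are illustrative names, not kernel code):

#include <stdio.h>

enum filter_t { VISIT, SKIP, SKIP_TREE };

/* Toy node: in_excess/children_in_excess mimic the memcg fields. */
struct node {
	const char *name;
	int in_excess;
	int children_in_excess;
	struct node *child[4];
};

static enum filter_t filter(struct node *n)
{
	if (n->in_excess)
		return VISIT;		/* reclaim this group */
	if (!n->children_in_excess)
		return SKIP_TREE;	/* nothing interesting below */
	return SKIP;			/* keep descending */
}

/* Pre-order walk; SKIP_TREE returns early instead of recursing. */
static void walk(struct node *n)
{
	enum filter_t f = filter(n);

	if (f == SKIP_TREE)
		return;
	if (f == VISIT)
		printf("visit %s\n", n->name);
	for (int i = 0; n->child[i]; i++)
		walk(n->child[i]);
}

int main(void)
{
	struct node c = { "C", 1, 0, { 0 } };		/* over its limit */
	struct node b = { "B", 0, 0, { 0 } };		/* pruned subtree */
	struct node a = { "A", 0, 1, { &b, &c } };	/* C contributed +1 */

	walk(&a);	/* prints only: visit C */
	return 0;
}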
@@ -968,6 +1058,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -980,15 +1071,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
-                                  struct mem_cgroup_reclaim_cookie *reclaim)
+                                  struct mem_cgroup_reclaim_cookie *reclaim,
+                                  mem_cgroup_iter_filter cond)
 {
        struct mem_cgroup *memcg = NULL;
        struct mem_cgroup *last_visited = NULL;
 
-       if (mem_cgroup_disabled())
-               return NULL;
+       if (mem_cgroup_disabled()) {
+               /* first call must return non-NULL, second return NULL */
+               return (struct mem_cgroup *)(unsigned long)!prev;
+       }
 
        if (!root)
                root = root_mem_cgroup;
@@ -999,7 +1093,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
        if (!root->use_hierarchy && root != root_mem_cgroup) {
                if (prev)
                        goto out_css_put;
-               return root;
+               if (mem_cgroup_filter(root, root, cond) == VISIT)
+                       return root;
+               return NULL;
        }
 
        rcu_read_lock();
@@ -1022,7 +1118,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                        last_visited = mem_cgroup_iter_load(iter, root, &seq);
                }
 
-               memcg = __mem_cgroup_iter_next(root, last_visited);
+               memcg = __mem_cgroup_iter_next(root, last_visited, cond);
 
                if (reclaim) {
                        mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1033,7 +1129,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                reclaim->generation = iter->generation;
                }
 
-               if (prev && !memcg)
+               /*
+                * We have finished the whole tree walk or no group has been
+                * visited because the filter told us to skip the root node.
+                */
+               if (!memcg && (prev || (cond && !last_visited)))
                        goto out_unlock;
        }
 out_unlock:
@@ -1777,14 +1877,22 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
  * hierarchy if
  *     a) it is over its soft limit
  *     b) any parent up the hierarchy is over its soft limit
+ *
+ * If the given group doesn't have any children over the limit then it
+ * doesn't make any sense to iterate its subtree.
  */
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
                struct mem_cgroup *root)
 {
-       struct mem_cgroup *parent = memcg;
+       struct mem_cgroup *parent;
+
+       if (!memcg)
+               memcg = root_mem_cgroup;
+       parent = memcg;
 
        if (res_counter_soft_limit_excess(&memcg->res))
-               return true;
+               return VISIT;
 
        /*
         * If any parent up to the root in the hierarchy is over its soft limit
@@ -1792,12 +1900,14 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
         */
        while((parent = parent_mem_cgroup(parent))) {
                if (res_counter_soft_limit_excess(&parent->res))
-                       return true;
+                       return VISIT;
                if (parent == root)
                        break;
        }
 
-       return false;
+       if (!atomic_read(&memcg->children_in_excess))
+               return SKIP_TREE;
+       return SKIP;
 }
 
 /*
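
The intended consumer of this filter is the reclaim path: vmscan can pass mem_cgroup_soft_reclaim_eligible as the cond argument of mem_cgroup_iter_cond() so that subtrees with nothing over their soft limit are never visited at all. A hedged sketch of such a caller is below; shrink_zone_memcgs, the soft_reclaim flag and the way the per-memcg shrink is invoked are assumptions for illustration, not the actual vmscan hunk of this series:

/*
 * Illustrative caller only: walk the hierarchy under @root and let
 * mem_cgroup_soft_reclaim_eligible() prune subtrees with nothing over
 * their soft limit during the soft limit targeted pass.
 */
static void shrink_zone_memcgs(struct zone *zone, struct scan_control *sc,
			       struct mem_cgroup *root, bool soft_reclaim)
{
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = sc->priority,
	};
	/* restrict the walk only while doing soft limit targeted reclaim */
	mem_cgroup_iter_filter filter = soft_reclaim ?
			mem_cgroup_soft_reclaim_eligible : NULL;
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_iter_cond(root, NULL, &reclaim, filter);
	do {
		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

		shrink_lruvec(lruvec, sc);
		memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter);
	} while (memcg);
}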
@@ -5852,6 +5962,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        mutex_init(&memcg->thresholds_lock);
        spin_lock_init(&memcg->move_lock);
        vmpressure_init(&memcg->vmpressure);
+       spin_lock_init(&memcg->soft_lock);
 
        return &memcg->css;
 
@@ -5929,6 +6040,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
        mem_cgroup_invalidate_reclaim_iterators(memcg);
        mem_cgroup_reparent_charges(memcg);
+       if (memcg->soft_contributed) {
+               while ((memcg = parent_mem_cgroup(memcg)))
+                       atomic_dec(&memcg->children_in_excess);
+       }
        mem_cgroup_destroy_all_caches(memcg);
        vmpressure_cleanup(&memcg->vmpressure);
 }