mm, oom: get rid of signal_struct::oom_victims
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d53a9aa..e2a2c35 100644
@@ -132,6 +132,11 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
        return oc->order == -1;
 }
 
+static inline bool is_memcg_oom(struct oom_control *oc)
+{
+       return oc->memcg != NULL;
+}
+
 /* return true if the task is not suitable as a candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p,
                struct mem_cgroup *memcg, const nodemask_t *nodemask)
@@ -181,7 +186,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
         */
        adj = (long)p->signal->oom_score_adj;
        if (adj == OOM_SCORE_ADJ_MIN ||
-                       test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
+                       test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
                        in_vfork(p)) {
                task_unlock(p);
                return 0;
@@ -213,12 +218,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
        return points > 0 ? points : 1;
 }
 
+enum oom_constraint {
+       CONSTRAINT_NONE,
+       CONSTRAINT_CPUSET,
+       CONSTRAINT_MEMORY_POLICY,
+       CONSTRAINT_MEMCG,
+};
+
 /*
  * Determine the type of allocation constraint.
  */
-#ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct oom_control *oc,
-                                            unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc)
 {
        struct zone *zone;
        struct zoneref *z;
@@ -226,8 +236,16 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
        bool cpuset_limited = false;
        int nid;
 
+       if (is_memcg_oom(oc)) {
+               oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
+               return CONSTRAINT_MEMCG;
+       }
+
        /* Default to all available memory */
-       *totalpages = totalram_pages + total_swap_pages;
+       oc->totalpages = totalram_pages + total_swap_pages;
+
+       if (!IS_ENABLED(CONFIG_NUMA))
+               return CONSTRAINT_NONE;
 
        if (!oc->zonelist)
                return CONSTRAINT_NONE;
@@ -246,9 +264,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
         */
        if (oc->nodemask &&
            !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
-               *totalpages = total_swap_pages;
+               oc->totalpages = total_swap_pages;
                for_each_node_mask(nid, *oc->nodemask)
-                       *totalpages += node_spanned_pages(nid);
+                       oc->totalpages += node_spanned_pages(nid);
                return CONSTRAINT_MEMORY_POLICY;
        }
 
@@ -259,98 +277,84 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
                        cpuset_limited = true;
 
        if (cpuset_limited) {
-               *totalpages = total_swap_pages;
+               oc->totalpages = total_swap_pages;
                for_each_node_mask(nid, cpuset_current_mems_allowed)
-                       *totalpages += node_spanned_pages(nid);
+                       oc->totalpages += node_spanned_pages(nid);
                return CONSTRAINT_CPUSET;
        }
        return CONSTRAINT_NONE;
 }
-#else
-static enum oom_constraint constrained_alloc(struct oom_control *oc,
-                                            unsigned long *totalpages)
-{
-       *totalpages = totalram_pages + total_swap_pages;
-       return CONSTRAINT_NONE;
-}
-#endif
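
With IS_ENABLED(CONFIG_NUMA) the !NUMA fallback above becomes ordinary dead code that the optimizer drops, so the duplicated #ifdef variant of constrained_alloc() can be deleted. A minimal userspace sketch of the idiom (IS_ENABLED and CONFIG_NUMA_MODEL here are stand-ins, not the kernel's kconfig-aware macros):

```c
#include <stdio.h>

#define CONFIG_NUMA_MODEL 1          /* flip to 0 to model CONFIG_NUMA=n */
#define IS_ENABLED(option) (option)  /* stand-in; the real macro inspects
                                        kconfig symbols */

enum constraint_model { MODEL_CONSTRAINT_NONE, MODEL_CONSTRAINT_CPUSET };

static enum constraint_model constrained_alloc_model(void)
{
	/*
	 * Both branches are parsed and type-checked even when the option
	 * is off; the dead branch is simply discarded by the compiler,
	 * which is what lets the #ifdef'ed duplicate go away.
	 */
	if (!IS_ENABLED(CONFIG_NUMA_MODEL))
		return MODEL_CONSTRAINT_NONE;

	return MODEL_CONSTRAINT_CPUSET;  /* NUMA-only path */
}

int main(void)
{
	printf("constraint: %d\n", constrained_alloc_model());
	return 0;
}
```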
 
-enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
-                                       struct task_struct *task)
+static int oom_evaluate_task(struct task_struct *task, void *arg)
 {
+       struct oom_control *oc = arg;
+       unsigned long points;
+
        if (oom_unkillable_task(task, NULL, oc->nodemask))
-               return OOM_SCAN_CONTINUE;
+               goto next;
 
        /*
         * This task already has access to memory reserves and is being killed.
         * Don't allow any other task to have access to the reserves unless
-        * the task has MMF_OOM_REAPED because chances that it would release
+        * the task has MMF_OOM_SKIP because chances that it would release
         * any memory are quite low.
         */
-       if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
-               struct task_struct *p = find_lock_task_mm(task);
-               enum oom_scan_t ret = OOM_SCAN_ABORT;
-
-               if (p) {
-                       if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
-                               ret = OOM_SCAN_CONTINUE;
-                       task_unlock(p);
-               }
-
-               return ret;
+       if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
+               if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+                       goto next;
+               goto abort;
        }
 
        /*
         * If the task is allocating a lot of memory and has been marked to be
         * killed first if it triggers an oom, then select it.
         */
-       if (oom_task_origin(task))
-               return OOM_SCAN_SELECT;
+       if (oom_task_origin(task)) {
+               points = ULONG_MAX;
+               goto select;
+       }
 
-       return OOM_SCAN_OK;
+       points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
+       if (!points || points < oc->chosen_points)
+               goto next;
+
+       /* Prefer thread group leaders for display purposes */
+       if (points == oc->chosen_points && thread_group_leader(oc->chosen))
+               goto next;
+select:
+       if (oc->chosen)
+               put_task_struct(oc->chosen);
+       get_task_struct(task);
+       oc->chosen = task;
+       oc->chosen_points = points;
+next:
+       return 0;
+abort:
+       if (oc->chosen)
+               put_task_struct(oc->chosen);
+       oc->chosen = (void *)-1UL;
+       return 1;
 }
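
oom_evaluate_task() now has the usual iterator-callback shape: return 0 to keep scanning, non-zero to abort the walk, and publish the current best candidate through the oom_control that is threaded through as the opaque argument. A userspace model of that protocol (all types, names and numbers below are illustrative, not kernel code):

```c
#include <stdio.h>

struct task { const char *comm; unsigned long points; int exiting; };
struct control { struct task *chosen; unsigned long chosen_points; };

/* Callback: models oom_evaluate_task(); non-zero return stops the walk. */
static int evaluate(struct task *t, void *arg)
{
	struct control *c = arg;

	if (t->exiting) {              /* models the tsk_is_oom_victim() abort */
		c->chosen = (void *)-1UL;
		return 1;
	}
	if (t->points > c->chosen_points) {
		c->chosen = t;
		c->chosen_points = t->points;
	}
	return 0;
}

int main(void)
{
	struct task tasks[] = { {"a", 10, 0}, {"b", 42, 0}, {"c", 7, 0} };
	struct control c = { 0 };

	for (unsigned int i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
		if (evaluate(&tasks[i], &c))
			break;

	if (c.chosen && c.chosen != (void *)-1UL)
		printf("chosen: %s (%lu points)\n",
		       c.chosen->comm, c.chosen_points);
	return 0;
}
```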
 
 /*
- * Simple selection loop. We chose the process with the highest
- * number of 'points'.  Returns -1 on scan abort.
+ * Simple selection loop. We choose the process with the highest number of
+ * 'points'. If the scan was aborted, oc->chosen is set to -1.
  */
-static struct task_struct *select_bad_process(struct oom_control *oc,
-               unsigned int *ppoints, unsigned long totalpages)
+static void select_bad_process(struct oom_control *oc)
 {
-       struct task_struct *p;
-       struct task_struct *chosen = NULL;
-       unsigned long chosen_points = 0;
-
-       rcu_read_lock();
-       for_each_process(p) {
-               unsigned int points;
-
-               switch (oom_scan_process_thread(oc, p)) {
-               case OOM_SCAN_SELECT:
-                       chosen = p;
-                       chosen_points = ULONG_MAX;
-                       /* fall through */
-               case OOM_SCAN_CONTINUE:
-                       continue;
-               case OOM_SCAN_ABORT:
-                       rcu_read_unlock();
-                       return (struct task_struct *)(-1UL);
-               case OOM_SCAN_OK:
-                       break;
-               };
-               points = oom_badness(p, NULL, oc->nodemask, totalpages);
-               if (!points || points < chosen_points)
-                       continue;
+       if (is_memcg_oom(oc))
+               mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
+       else {
+               struct task_struct *p;
 
-               chosen = p;
-               chosen_points = points;
+               rcu_read_lock();
+               for_each_process(p)
+                       if (oom_evaluate_task(p, oc))
+                               break;
+               rcu_read_unlock();
        }
-       if (chosen)
-               get_task_struct(chosen);
-       rcu_read_unlock();
 
-       *ppoints = chosen_points * 1000 / totalpages;
-       return chosen;
+       oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
 }
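
The final line preserves the old *ppoints contract: the raw badness (roughly, pages attributable to the victim) is rescaled to a 0..1000 share of oc->totalpages. A toy calculation with invented numbers:

```c
#include <stdio.h>

int main(void)
{
	unsigned long chosen_points = 123456;  /* pages charged to the victim */
	unsigned long totalpages = 2097152;    /* 8GB of 4K pages */

	/* ~58: the victim owns about 5.8% of usable memory + swap */
	printf("normalized badness: %lu/1000\n",
	       chosen_points * 1000 / totalpages);
	return 0;
}
```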
 
 /**
@@ -419,7 +423,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 static atomic_t oom_victims = ATOMIC_INIT(0);
 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
-bool oom_killer_disabled __read_mostly;
+static bool oom_killer_disabled __read_mostly;
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
@@ -452,12 +456,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-static bool __oom_reap_task(struct task_struct *tsk)
+static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
-       struct mm_struct *mm = NULL;
-       struct task_struct *p;
        struct zap_details details = {.check_swap_entries = true,
                                      .ignore_dirty = true};
        bool ret = true;
@@ -465,7 +467,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
        /*
         * We have to make sure to not race with the victim exit path
         * and cause premature new oom victim selection:
-        * __oom_reap_task              exit_mm
+        * __oom_reap_task_mm           exit_mm
         *   mmget_not_zero
         *                                mmput
         *                                  atomic_dec_and_test
@@ -478,22 +480,9 @@ static bool __oom_reap_task(struct task_struct *tsk)
         */
        mutex_lock(&oom_lock);
 
-       /*
-        * Make sure we find the associated mm_struct even when the particular
-        * thread has already terminated and cleared its mm.
-        * We might have race with exit path so consider our work done if there
-        * is no mm.
-        */
-       p = find_lock_task_mm(tsk);
-       if (!p)
-               goto unlock_oom;
-       mm = p->mm;
-       atomic_inc(&mm->mm_count);
-       task_unlock(p);
-
        if (!down_read_trylock(&mm->mmap_sem)) {
                ret = false;
-               goto mm_drop;
+               goto unlock_oom;
        }
 
        /*
@@ -503,7 +492,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
         */
        if (!mmget_not_zero(mm)) {
                up_read(&mm->mmap_sem);
-               goto mm_drop;
+               goto unlock_oom;
        }
 
        tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -540,19 +529,12 @@ static bool __oom_reap_task(struct task_struct *tsk)
                        K(get_mm_counter(mm, MM_SHMEMPAGES)));
        up_read(&mm->mmap_sem);
 
-       /*
-        * This task can be safely ignored because we cannot do much more
-        * to release its memory.
-        */
-       set_bit(MMF_OOM_REAPED, &mm->flags);
        /*
         * Drop our reference but make sure the mmput slow path is called from a
         * different context because we shouldn't risk getting stuck there and
         * putting the oom_reaper out of the way.
         */
        mmput_async(mm);
-mm_drop:
-       mmdrop(mm);
 unlock_oom:
        mutex_unlock(&oom_lock);
        return ret;
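
The mmget_not_zero() call above only pins the mm when mm_users has not yet dropped to zero, so the reaper can never resurrect an address space that the exit path is already tearing down. A C11-atomics sketch of that increment-if-not-zero pattern (userspace model, not the kernel implementation):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if the count is still live; models mmget_not_zero(). */
static bool get_not_zero(atomic_int *users)
{
	int old = atomic_load(users);

	while (old != 0) {
		/* CAS either bumps a non-zero count or reloads 'old' and
		 * retries; once zero is observed, the object is dead. */
		if (atomic_compare_exchange_weak(users, &old, old + 1))
			return true;
	}
	return false;
}

int main(void)
{
	atomic_int mm_users = 1;

	printf("got ref: %d\n", get_not_zero(&mm_users)); /* 1 */
	atomic_store(&mm_users, 0);                       /* last user gone */
	printf("got ref: %d\n", get_not_zero(&mm_users)); /* 0 */
	return 0;
}
```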
@@ -562,36 +544,21 @@ unlock_oom:
 static void oom_reap_task(struct task_struct *tsk)
 {
        int attempts = 0;
+       struct mm_struct *mm = tsk->signal->oom_mm;
 
        /* Retry the down_read_trylock(mmap_sem) a few times */
-       while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
+       while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
                schedule_timeout_idle(HZ/10);
 
-       if (attempts > MAX_OOM_REAP_RETRIES) {
-               struct task_struct *p;
+       if (attempts <= MAX_OOM_REAP_RETRIES)
+               goto done;
 
-               pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-                               task_pid_nr(tsk), tsk->comm);
+       pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+               task_pid_nr(tsk), tsk->comm);
+       debug_show_all_locks();
 
-               /*
-                * If we've already tried to reap this task in the past and
-                * failed it probably doesn't make much sense to try yet again
-                * so hide the mm from the oom killer so that it can move on
-                * to another task with a different mm struct.
-                */
-               p = find_lock_task_mm(tsk);
-               if (p) {
-                       if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
-                               pr_info("oom_reaper: giving up pid:%d (%s)\n",
-                                               task_pid_nr(tsk), tsk->comm);
-                               set_bit(MMF_OOM_REAPED, &p->mm->flags);
-                       }
-                       task_unlock(p);
-               }
-
-               debug_show_all_locks();
-       }
-
+done:
        /*
         * Clear TIF_MEMDIE because the task shouldn't be sitting on
         * reasonably reclaimable memory anymore, or it is not a good candidate
@@ -601,6 +568,12 @@ static void oom_reap_task(struct task_struct *tsk)
        tsk->oom_reaper_list = NULL;
        exit_oom_victim(tsk);
 
+       /*
+        * Hide this mm from the OOM killer because it has either been reaped
+        * or could not be reaped (somebody is stuck and cannot release
+        * mmap_sem).
+        */
+       set_bit(MMF_OOM_SKIP, &mm->flags);
+
        /* Drop a reference taken by wake_oom_reaper */
        put_task_struct(tsk);
 }
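
oom_reap_task() bounds how long the reaper spins on a contended mmap_sem before giving up and (below) setting MMF_OOM_SKIP. A userspace sketch of that bounded trylock-and-back-off loop (the lock, constants and sleep interval are illustrative):

```c
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define MAX_RETRIES 10

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static int reap_with_retries(void)
{
	struct timespec ts = { .tv_nsec = 100 * 1000 * 1000 }; /* ~HZ/10 */
	int attempts = 0;

	while (attempts++ < MAX_RETRIES) {
		if (pthread_rwlock_tryrdlock(&mmap_sem) == 0) {
			/* ... unmap the victim's memory here ... */
			pthread_rwlock_unlock(&mmap_sem);
			return 0;
		}
		nanosleep(&ts, NULL);  /* a writer holds the lock; back off */
	}
	return -1;  /* caller would now hide the mm (MMF_OOM_SKIP) */
}

int main(void)
{
	printf("reap: %s\n", reap_with_retries() ? "gave up" : "done");
	return 0;
}
```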
@@ -627,7 +600,7 @@ static int oom_reaper(void *unused)
        return 0;
 }
 
-void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct task_struct *tsk)
 {
        if (!oom_reaper_th)
                return;
@@ -656,7 +629,11 @@ static int __init oom_init(void)
        return 0;
 }
 subsys_initcall(oom_init)
-#endif
+#else
+static inline void wake_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif /* CONFIG_MMU */
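
The new !CONFIG_MMU stub means call sites such as oom_kill_process() never need an #ifdef of their own. The idiom in miniature (HAVE_REAPER is a made-up stand-in for CONFIG_MMU):

```c
#include <stdio.h>

#define HAVE_REAPER 0  /* flip to 1 to model CONFIG_MMU=y */

#if HAVE_REAPER
static void wake_reaper(const char *comm)
{
	printf("queueing %s for reaping\n", comm);
}
#else
/* Empty inline stub: callers compile unchanged and the call folds away. */
static inline void wake_reaper(const char *comm) { (void)comm; }
#endif

int main(void)
{
	wake_reaper("victim");  /* valid whether or not the feature exists */
	return 0;
}
```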
 
 /**
  * mark_oom_victim - mark the given task as OOM victim
@@ -664,14 +641,23 @@ subsys_initcall(oom_init)
  *
  * Has to be called with oom_lock held and never after
  * oom has been disabled already.
+ *
+ * tsk->mm has to be non-NULL and the caller has to guarantee it is stable
+ * (either by holding task_lock or by operating on the current task).
  */
-void mark_oom_victim(struct task_struct *tsk)
+static void mark_oom_victim(struct task_struct *tsk)
 {
+       struct mm_struct *mm = tsk->mm;
+
        WARN_ON(oom_killer_disabled);
        /* OOM killer might race with memcg OOM */
        if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
                return;
-       atomic_inc(&tsk->signal->oom_victims);
+
+       /* oom_mm is bound to the signal struct lifetime. */
+       if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
+               atomic_inc(&tsk->signal->oom_mm->mm_count);
+
        /*
         * Make sure that the task is woken up from uninterruptible sleep
         * if it is frozen because OOM killer wouldn't be able to free
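
The cmpxchg() above publishes signal->oom_mm at most once per thread group: only the first mark takes the mm_count reference, so marking several threads of one victim cannot leak references. A userspace model with C11 atomics (a sketch; the kernel's cmpxchg() returns the old value rather than a success flag):

```c
#include <stdatomic.h>
#include <stdio.h>

struct mm { atomic_int mm_count; };

static _Atomic(struct mm *) oom_mm;  /* models signal->oom_mm, NULL at start */

static void mark_victim(struct mm *mm)
{
	struct mm *expected = NULL;

	/* Only the winner of the publication race pins the mm. */
	if (atomic_compare_exchange_strong(&oom_mm, &expected, mm))
		atomic_fetch_add(&mm->mm_count, 1);
}

int main(void)
{
	struct mm mm = { .mm_count = 1 };

	mark_victim(&mm);
	mark_victim(&mm);  /* second mark is a no-op */
	printf("mm_count = %d\n", atomic_load(&mm.mm_count)); /* 2, not 3 */
	return 0;
}
```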
@@ -689,7 +675,6 @@ void exit_oom_victim(struct task_struct *tsk)
 {
        if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
                return;
-       atomic_dec(&tsk->signal->oom_victims);
 
        if (!atomic_dec_return(&oom_victims))
                wake_up_all(&oom_victims_wait);
@@ -760,7 +745,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
  * Caller has to make sure that task->mm is stable (hold task_lock or
  * operate on the current task).
  */
-bool task_will_free_mem(struct task_struct *task)
+static bool task_will_free_mem(struct task_struct *task)
 {
        struct mm_struct *mm = task->mm;
        struct task_struct *p;
@@ -781,15 +766,16 @@ bool task_will_free_mem(struct task_struct *task)
         * This task has already been drained by the oom reaper so there are
         * only small chances it will free some more
         */
-       if (test_bit(MMF_OOM_REAPED, &mm->flags))
+       if (test_bit(MMF_OOM_SKIP, &mm->flags))
                return false;
 
        if (atomic_read(&mm->mm_users) <= 1)
                return true;
 
        /*
-        * This is really pessimistic but we do not have any reliable way
-        * to check that external processes share with our mm
+        * Make sure that all tasks which share the mm with the given task
+        * are dying as well, so that a) nobody pins its mm and
+        * b) the task is also reapable by the oom reaper.
         */
        rcu_read_lock();
        for_each_process(p) {
@@ -806,14 +792,10 @@ bool task_will_free_mem(struct task_struct *task)
        return ret;
 }
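
The loop above replaces the old blanket pessimism with an explicit check: the mm only counts as about-to-be-freed when every task sharing it is dying too. A toy model of that all-sharers predicate (the structures are stand-ins, not kernel types):

```c
#include <stdbool.h>
#include <stdio.h>

struct task { int mm_id; bool dying; };

/* True only if no live task still pins the given mm. */
static bool mm_will_be_freed(const struct task *tasks, int n, int mm_id)
{
	for (int i = 0; i < n; i++) {
		if (tasks[i].mm_id != mm_id)
			continue;       /* different mm: irrelevant */
		if (!tasks[i].dying)
			return false;   /* a live sharer pins the mm */
	}
	return true;
}

int main(void)
{
	struct task tasks[] = { {1, true}, {1, false}, {2, true} };

	printf("mm 1: %d\n", mm_will_be_freed(tasks, 3, 1)); /* 0: pinned */
	printf("mm 2: %d\n", mm_will_be_freed(tasks, 3, 2)); /* 1 */
	return 0;
}
```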
 
-/*
- * Must be called while holding a reference to p, which will be released upon
- * returning.
- */
-void oom_kill_process(struct oom_control *oc, struct task_struct *p,
-                     unsigned int points, unsigned long totalpages,
-                     const char *message)
+static void oom_kill_process(struct oom_control *oc, const char *message)
 {
+       struct task_struct *p = oc->chosen;
+       unsigned int points = oc->chosen_points;
        struct task_struct *victim = p;
        struct task_struct *child;
        struct task_struct *t;
@@ -860,7 +842,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                         * oom_badness() returns 0 if the thread is unkillable
                         */
                        child_points = oom_badness(child,
-                                       oc->memcg, oc->nodemask, totalpages);
+                               oc->memcg, oc->nodemask, oc->totalpages);
                        if (child_points > victim_points) {
                                put_task_struct(victim);
                                victim = child;
@@ -921,7 +903,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                         * killer to guarantee OOM forward progress.
                         */
                        can_oom_reap = false;
-                       set_bit(MMF_OOM_REAPED, &mm->flags);
+                       set_bit(MMF_OOM_SKIP, &mm->flags);
                        pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
                                        task_pid_nr(victim), victim->comm,
                                        task_pid_nr(p), p->comm);
@@ -942,7 +924,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
+static void check_panic_on_oom(struct oom_control *oc,
+                              enum oom_constraint constraint)
 {
        if (likely(!sysctl_panic_on_oom))
                return;
@@ -988,19 +971,18 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
  */
 bool out_of_memory(struct oom_control *oc)
 {
-       struct task_struct *p;
-       unsigned long totalpages;
        unsigned long freed = 0;
-       unsigned int uninitialized_var(points);
        enum oom_constraint constraint = CONSTRAINT_NONE;
 
        if (oom_killer_disabled)
                return false;
 
-       blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
-       if (freed > 0)
-               /* Got some memory back in the last second. */
-               return true;
+       if (!is_memcg_oom(oc)) {
+               blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+               if (freed > 0)
+                       /* Got some memory back in the last second. */
+                       return true;
+       }
 
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
@@ -1024,37 +1006,38 @@ bool out_of_memory(struct oom_control *oc)
 
        /*
         * Check if there were limitations on the allocation (only relevant for
-        * NUMA) that may require different handling.
+        * NUMA and memcg) that may require different handling.
         */
-       constraint = constrained_alloc(oc, &totalpages);
+       constraint = constrained_alloc(oc);
        if (constraint != CONSTRAINT_MEMORY_POLICY)
                oc->nodemask = NULL;
        check_panic_on_oom(oc, constraint);
 
-       if (sysctl_oom_kill_allocating_task && current->mm &&
-           !oom_unkillable_task(current, NULL, oc->nodemask) &&
+       if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
+           current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
                get_task_struct(current);
-               oom_kill_process(oc, current, 0, totalpages,
-                                "Out of memory (oom_kill_allocating_task)");
+               oc->chosen = current;
+               oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
                return true;
        }
 
-       p = select_bad_process(oc, &points, totalpages);
+       select_bad_process(oc);
        /* Found nothing?!?! Either we hang forever, or we panic. */
-       if (!p && !is_sysrq_oom(oc)) {
+       if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
                dump_header(oc, NULL);
                panic("Out of memory and no killable processes...\n");
        }
-       if (p && p != (void *)-1UL) {
-               oom_kill_process(oc, p, points, totalpages, "Out of memory");
+       if (oc->chosen && oc->chosen != (void *)-1UL) {
+               oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
+                                "Memory cgroup out of memory");
                /*
                 * Give the killed process a good chance to exit before trying
                 * to allocate memory again.
                 */
                schedule_timeout_killable(1);
        }
-       return true;
+       return !!oc->chosen;
 }
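
With both the global and the memcg constraint handled inside out_of_memory(), a memcg charge failure can drive this same path instead of carrying its own selection loop. The memcontrol.c side of this series reduces to something like the following (a sketch based on the companion patches, not part of this hunk):

```c
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = memcg,		/* makes is_memcg_oom(&oc) true */
		.gfp_mask = gfp_mask,
		.order = order,
	};
	bool ret;

	mutex_lock(&oom_lock);
	ret = out_of_memory(&oc);	/* scans only tasks in the memcg */
	mutex_unlock(&oom_lock);
	return ret;
}
```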
 
 /*