diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ddf7448..7d0a275 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 
        /*
         * Do not even consider tasks which are explicitly marked oom
-        * unkillable or have been already oom reaped.
+        * unkillable or have been already oom reaped or they are in
+        * the middle of vfork.
         */
        adj = (long)p->signal->oom_score_adj;
        if (adj == OOM_SCORE_ADJ_MIN ||
-                       test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
+                       test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
+                       in_vfork(p)) {
                task_unlock(p);
                return 0;
        }
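The in_vfork() check added above keeps oom_badness() from scoring a task that is sitting in the middle of vfork(): such a task temporarily runs on its parent's mm, so selecting it would not release that address space, and its oom_score_adj effectively describes the parent. The helper itself is introduced elsewhere in this series in <linux/sched.h>; the sketch below shows roughly what it tests and is illustrative rather than a verbatim copy.

/*
 * Rough sketch of the in_vfork() test: a vforked child has vfork_done set
 * and still runs on its parent's mm until it execs or exits.  real_parent
 * can change under us, hence the RCU read lock.
 */
static inline bool in_vfork_sketch(struct task_struct *tsk)
{
        bool ret;

        rcu_read_lock();
        ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
        rcu_read_unlock();

        return ret;
}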
@@ -274,17 +276,29 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
 #endif
 
 enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
-                       struct task_struct *task, unsigned long totalpages)
+                                       struct task_struct *task)
 {
        if (oom_unkillable_task(task, NULL, oc->nodemask))
                return OOM_SCAN_CONTINUE;
 
        /*
         * This task already has access to memory reserves and is being killed.
-        * Don't allow any other task to have access to the reserves.
+        * Don't allow any other task to have access to the reserves unless
+        * the task has MMF_OOM_REAPED, because the chances that it would
+        * release any memory are quite low.
         */
-       if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
-               return OOM_SCAN_ABORT;
+       if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
+               struct task_struct *p = find_lock_task_mm(task);
+               enum oom_scan_t ret = OOM_SCAN_ABORT;
+
+               if (p) {
+                       if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
+                               ret = OOM_SCAN_CONTINUE;
+                       task_unlock(p);
+               }
+
+               return ret;
+       }
 
        /*
         * If task is allocating a lot of memory and has been marked to be
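oom_scan_process_thread() reports its verdict through enum oom_scan_t, so returning OOM_SCAN_CONTINUE for an already-reaped victim lets select_bad_process() keep scanning instead of aborting the whole OOM invocation. For orientation, the values as defined in the 4.8-era include/linux/oom.h (quoted from memory, so treat it as a reference aid rather than authoritative):

enum oom_scan_t {
        OOM_SCAN_OK,            /* scan thread and find its badness */
        OOM_SCAN_CONTINUE,      /* do not consider thread for oom kill */
        OOM_SCAN_ABORT,         /* abort the iteration and return */
        OOM_SCAN_SELECT,        /* always select this thread first */
};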
@@ -311,7 +325,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
        for_each_process(p) {
                unsigned int points;
 
-               switch (oom_scan_process_thread(oc, p, totalpages)) {
+               switch (oom_scan_process_thread(oc, p)) {
                case OOM_SCAN_SELECT:
                        chosen = p;
                        chosen_points = ULONG_MAX;
@@ -383,8 +397,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
        rcu_read_unlock();
 }
 
-static void dump_header(struct oom_control *oc, struct task_struct *p,
-                       struct mem_cgroup *memcg)
+static void dump_header(struct oom_control *oc, struct task_struct *p)
 {
        pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
                current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
@@ -392,12 +405,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p,
 
        cpuset_print_current_mems_allowed();
        dump_stack();
-       if (memcg)
-               mem_cgroup_print_oom_info(memcg, p);
+       if (oc->memcg)
+               mem_cgroup_print_oom_info(oc->memcg, p);
        else
                show_mem(SHOW_MEM_FILTER_NODES);
        if (sysctl_oom_dump_tasks)
-               dump_tasks(memcg, oc->nodemask);
+               dump_tasks(oc->memcg, oc->nodemask);
 }
 
 /*
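The memcg pointer that dump_header() now reads from oc is a new member of struct oom_control, so the memcg is threaded through the whole OOM path instead of being passed around as an extra argument. A sketch of the structure as it looks after this series; the field order matches the designated initializer in pagefault_out_of_memory() at the end of this diff, and the comments are paraphrased:

struct oom_control {
        struct zonelist *zonelist;      /* allocation context */
        nodemask_t *nodemask;           /* mempolicy constraint, if any */
        struct mem_cgroup *memcg;       /* memcg of the OOM, NULL for a global OOM */
        gfp_t gfp_mask;                 /* gfp mask of the failing allocation */
        int order;                      /* allocation order, -1 for a sysrq-forced OOM */
};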
@@ -416,7 +429,7 @@ bool oom_killer_disabled __read_mostly;
  * task's threads: if one of those is using this mm then this task was also
  * using it.
  */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 {
        struct task_struct *t;
 
@@ -453,7 +466,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
         * We have to make sure to not race with the victim exit path
         * and cause premature new oom victim selection:
         * __oom_reap_task              exit_mm
-        *   atomic_inc_not_zero
+        *   mmget_not_zero
         *                                mmput
         *                                  atomic_dec_and_test
         *                                exit_oom_victim
@@ -475,12 +488,22 @@ static bool __oom_reap_task(struct task_struct *tsk)
        if (!p)
                goto unlock_oom;
        mm = p->mm;
-       atomic_inc(&mm->mm_users);
+       atomic_inc(&mm->mm_count);
        task_unlock(p);
 
        if (!down_read_trylock(&mm->mmap_sem)) {
                ret = false;
-               goto unlock_oom;
+               goto mm_drop;
+       }
+
+       /*
+        * Increase mm_users only after we know we will reap something, so
+        * that mmput_async() is called only when we have actually reaped
+        * something and the delayed __mmput() doesn't matter that much.
+        */
+       if (!mmget_not_zero(mm)) {
+               up_read(&mm->mmap_sem);
+               goto mm_drop;
        }
 
        tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -522,15 +545,16 @@ static bool __oom_reap_task(struct task_struct *tsk)
         * to release its memory.
         */
        set_bit(MMF_OOM_REAPED, &mm->flags);
-unlock_oom:
-       mutex_unlock(&oom_lock);
        /*
         * Drop our reference but make sure the mmput slow path is called from a
         * different context because we shouldn't risk we get stuck there and
         * put the oom_reaper out of the way.
         */
-       if (mm)
-               mmput_async(mm);
+       mmput_async(mm);
+mm_drop:
+       mmdrop(mm);
+unlock_oom:
+       mutex_unlock(&oom_lock);
        return ret;
 }
 
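The two hunks above change how the reaper pins the victim's mm: mm_count is taken first so the mm_struct (and thus mm->flags and mm->mmap_sem) stays valid even if every user of the address space exits, while mm_users is taken via mmget_not_zero() only once the reaper knows it will do real work, so a failed trylock never leaves it on the hook for a delayed __mmput(). A condensed, hypothetical sketch of that ordering (error handling and the actual unmapping are elided, and the function name is made up):

/* Condensed sketch of the pin/release ordering; not a drop-in excerpt. */
static bool oom_reap_mm_sketch(struct task_struct *p)
{
        struct mm_struct *mm = p->mm;           /* caller holds task_lock(p) */
        bool ret = true;

        atomic_inc(&mm->mm_count);              /* pin the mm_struct, paired with mmdrop() */
        task_unlock(p);

        if (!down_read_trylock(&mm->mmap_sem)) {
                ret = false;                    /* retry later; no mm_users taken yet */
                goto drop;
        }
        if (!mmget_not_zero(mm)) {              /* pin the address space only if still live */
                up_read(&mm->mmap_sem);
                goto drop;
        }

        /* ... unmap anonymous, non-shared ranges and set MMF_OOM_REAPED ... */

        up_read(&mm->mmap_sem);
        mmput_async(mm);                        /* drop mm_users from a workqueue, never block here */
drop:
        mmdrop(mm);                             /* drop the mm_count pin */
        return ret;
}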
@@ -544,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk)
                schedule_timeout_idle(HZ/10);
 
        if (attempts > MAX_OOM_REAP_RETRIES) {
+               struct task_struct *p;
+
                pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
                                task_pid_nr(tsk), tsk->comm);
+
+               /*
+                * If we've already tried to reap this task in the past and
+                * failed, it probably doesn't make much sense to try yet again,
+                * so hide the mm from the oom killer so that it can move on
+                * to another task with a different mm struct.
+                */
+               p = find_lock_task_mm(tsk);
+               if (p) {
+                       if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
+                               pr_info("oom_reaper: giving up pid:%d (%s)\n",
+                                               task_pid_nr(tsk), tsk->comm);
+                               set_bit(MMF_OOM_REAPED, &p->mm->flags);
+                       }
+                       task_unlock(p);
+               }
+
                debug_show_all_locks();
        }
 
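The give-up path above relies on test_and_set_bit() returning the previous value of the bit: the first failed pass merely records MMF_OOM_NOT_REAPABLE, and a second failed pass finds the bit already set and escalates to MMF_OOM_REAPED so the OOM killer stops waiting on this mm. Reduced to its core (a sketch; both flags are specific to this series):

static void oom_reaper_give_up_sketch(struct mm_struct *mm)
{
        /* first strike: remember the failure and let the reaper retry later */
        if (!test_and_set_bit(MMF_OOM_NOT_REAPABLE, &mm->flags))
                return;

        /* second strike: hide the mm so select_bad_process() moves on */
        set_bit(MMF_OOM_REAPED, &mm->flags);
}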
@@ -584,7 +627,7 @@ static int oom_reaper(void *unused)
        return 0;
 }
 
-static void wake_oom_reaper(struct task_struct *tsk)
+void wake_oom_reaper(struct task_struct *tsk)
 {
        if (!oom_reaper_th)
                return;
@@ -602,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
        wake_up(&oom_reaper_wait);
 }
 
-/* Check if we can reap the given task. This has to be called with stable
- * tsk->mm
- */
-void try_oom_reaper(struct task_struct *tsk)
-{
-       struct mm_struct *mm = tsk->mm;
-       struct task_struct *p;
-
-       if (!mm)
-               return;
-
-       /*
-        * There might be other threads/processes which are either not
-        * dying or even not killable.
-        */
-       if (atomic_read(&mm->mm_users) > 1) {
-               rcu_read_lock();
-               for_each_process(p) {
-                       if (!process_shares_mm(p, mm))
-                               continue;
-                       if (fatal_signal_pending(p))
-                               continue;
-
-                       /*
-                        * If the task is exiting make sure the whole thread group
-                        * is exiting and cannot acces mm anymore.
-                        */
-                       if (signal_group_exit(p->signal))
-                               continue;
-
-                       /* Give up */
-                       rcu_read_unlock();
-                       return;
-               }
-               rcu_read_unlock();
-       }
-
-       wake_oom_reaper(tsk);
-}
-
 static int __init oom_init(void)
 {
        oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -653,10 +656,6 @@ static int __init oom_init(void)
        return 0;
 }
 subsys_initcall(oom_init)
-#else
-static void wake_oom_reaper(struct task_struct *tsk)
-{
-}
 #endif
 
 /**
@@ -733,13 +732,87 @@ void oom_killer_enable(void)
        oom_killer_disabled = false;
 }
 
+static inline bool __task_will_free_mem(struct task_struct *task)
+{
+       struct signal_struct *sig = task->signal;
+
+       /*
+        * A coredumping process may sleep for an extended period in exit_mm(),
+        * so the oom killer cannot assume that the process will promptly exit
+        * and release memory.
+        */
+       if (sig->flags & SIGNAL_GROUP_COREDUMP)
+               return false;
+
+       if (sig->flags & SIGNAL_GROUP_EXIT)
+               return true;
+
+       if (thread_group_empty(task) && (task->flags & PF_EXITING))
+               return true;
+
+       return false;
+}
+
+/*
+ * Checks whether the given task is dying or exiting and likely to
+ * release its address space. This means that all threads and processes
+ * sharing the same mm have to be killed or exiting.
+ * Caller has to make sure that task->mm is stable (hold task_lock or
+ * it operates on the current).
+ */
+bool task_will_free_mem(struct task_struct *task)
+{
+       struct mm_struct *mm = task->mm;
+       struct task_struct *p;
+       bool ret = true;
+
+       /*
+        * Skip tasks without an mm: they might have already passed exit_mm and
+        * exit_oom_victim. The oom_reaper could have rescued that, but do not
+        * rely on it for now. We can consider find_lock_task_mm in the future.
+        */
+       if (!mm)
+               return false;
+
+       if (!__task_will_free_mem(task))
+               return false;
+
+       /*
+        * This task has already been drained by the oom reaper, so the
+        * chances that it will free any more memory are slim.
+        */
+       if (test_bit(MMF_OOM_REAPED, &mm->flags))
+               return false;
+
+       if (atomic_read(&mm->mm_users) <= 1)
+               return true;
+
+       /*
+        * This is really pessimistic, but we do not have any reliable way
+        * to check whether external processes share our mm.
+        */
+       rcu_read_lock();
+       for_each_process(p) {
+               if (!process_shares_mm(p, mm))
+                       continue;
+               if (same_thread_group(task, p))
+                       continue;
+               ret = __task_will_free_mem(p);
+               if (!ret)
+                       break;
+       }
+       rcu_read_unlock();
+
+       return ret;
+}
+
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
  */
 void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                      unsigned int points, unsigned long totalpages,
-                     struct mem_cgroup *memcg, const char *message)
+                     const char *message)
 {
        struct task_struct *victim = p;
        struct task_struct *child;
@@ -755,9 +828,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
         * its children or threads, just set TIF_MEMDIE so it can die quickly
         */
        task_lock(p);
-       if (p->mm && task_will_free_mem(p)) {
+       if (task_will_free_mem(p)) {
                mark_oom_victim(p);
-               try_oom_reaper(p);
+               wake_oom_reaper(p);
                task_unlock(p);
                put_task_struct(p);
                return;
@@ -765,7 +838,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
        task_unlock(p);
 
        if (__ratelimit(&oom_rs))
-               dump_header(oc, p, memcg);
+               dump_header(oc, p);
 
        pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
                message, task_pid_nr(p), p->comm, points);
@@ -786,8 +859,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                        /*
                         * oom_badness() returns 0 if the thread is unkillable
                         */
-                       child_points = oom_badness(child, memcg, oc->nodemask,
-                                                               totalpages);
+                       child_points = oom_badness(child,
+                                       oc->memcg, oc->nodemask, totalpages);
                        if (child_points > victim_points) {
                                put_task_struct(victim);
                                victim = child;
@@ -840,14 +913,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                        continue;
                if (same_thread_group(p, victim))
                        continue;
-               if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
-                   p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+               if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) {
                        /*
                         * We cannot use oom_reaper for the mm shared by this
                         * process because it wouldn't get killed and so the
-                        * memory might be still used.
+                        * memory might still be used. Hide the mm from the oom
+                        * killer to guarantee OOM forward progress.
                         */
                        can_oom_reap = false;
+                       set_bit(MMF_OOM_REAPED, &mm->flags);
+                       pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
+                                       task_pid_nr(victim), victim->comm,
+                                       task_pid_nr(p), p->comm);
                        continue;
                }
                do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
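For context on the PF_KTHREAD branch above: a kernel thread can temporarily adopt a user address space (vhost workers do, for example), and because kernel threads are never OOM-killed, such an mm stays pinned no matter how many user processes die. Marking it MMF_OOM_REAPED up front lets the OOM killer move on instead of waiting forever. A minimal illustration of how that sharing arises, using the 4.8-era API:

/* Sketch only: how a kernel thread ends up sharing a user mm. */
#include <linux/mmu_context.h>

static void kthread_borrow_mm_sketch(struct mm_struct *mm)
{
        use_mm(mm);     /* the kthread temporarily runs on the user address space */

        /* ... access user memory on behalf of the process (e.g. vhost) ... */

        unuse_mm(mm);   /* give the address space back */
}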
@@ -865,8 +942,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
-                       struct mem_cgroup *memcg)
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
 {
        if (likely(!sysctl_panic_on_oom))
                return;
@@ -882,7 +958,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
        /* Do not panic for oom kills triggered by sysrq */
        if (is_sysrq_oom(oc))
                return;
-       dump_header(oc, NULL, memcg);
+       dump_header(oc, NULL);
        panic("Out of memory: %s panic_on_oom is enabled\n",
                sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
 }
@@ -930,14 +1006,10 @@ bool out_of_memory(struct oom_control *oc)
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
-        *
-        * But don't select if current has already released its mm and cleared
-        * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
         */
-       if (current->mm &&
-           (fatal_signal_pending(current) || task_will_free_mem(current))) {
+       if (task_will_free_mem(current)) {
                mark_oom_victim(current);
-               try_oom_reaper(current);
+               wake_oom_reaper(current);
                return true;
        }
 
@@ -957,13 +1029,13 @@ bool out_of_memory(struct oom_control *oc)
        constraint = constrained_alloc(oc, &totalpages);
        if (constraint != CONSTRAINT_MEMORY_POLICY)
                oc->nodemask = NULL;
-       check_panic_on_oom(oc, constraint, NULL);
+       check_panic_on_oom(oc, constraint);
 
        if (sysctl_oom_kill_allocating_task && current->mm &&
            !oom_unkillable_task(current, NULL, oc->nodemask) &&
            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
                get_task_struct(current);
-               oom_kill_process(oc, current, 0, totalpages, NULL,
+               oom_kill_process(oc, current, 0, totalpages,
                                 "Out of memory (oom_kill_allocating_task)");
                return true;
        }
@@ -971,12 +1043,11 @@ bool out_of_memory(struct oom_control *oc)
        p = select_bad_process(oc, &points, totalpages);
        /* Found nothing?!?! Either we hang forever, or we panic. */
        if (!p && !is_sysrq_oom(oc)) {
-               dump_header(oc, NULL, NULL);
+               dump_header(oc, NULL);
                panic("Out of memory and no killable processes...\n");
        }
        if (p && p != (void *)-1UL) {
-               oom_kill_process(oc, p, points, totalpages, NULL,
-                                "Out of memory");
+               oom_kill_process(oc, p, points, totalpages, "Out of memory");
                /*
                 * Give the killed process a good chance to exit before trying
                 * to allocate memory again.
@@ -988,14 +1059,15 @@ bool out_of_memory(struct oom_control *oc)
 
 /*
  * The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task.  If any populated zone has ZONE_OOM_LOCKED set, a
- * parallel oom killing is already in progress so do nothing.
+ * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
+ * killing is already in progress so do nothing.
  */
 void pagefault_out_of_memory(void)
 {
        struct oom_control oc = {
                .zonelist = NULL,
                .nodemask = NULL,
+               .memcg = NULL,
                .gfp_mask = 0,
                .order = 0,
        };