oom, PM: make OOM detection in the freezer path raceless

author Michal Hocko <mhocko@suse.cz>

Wed, 11 Feb 2015 23:26:24 +0000 (15:26 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 12 Feb 2015 01:06:03 +0000 (17:06 -0800)
author Michal Hocko <mhocko@suse.cz>
Wed, 11 Feb 2015 23:26:24 +0000 (15:26 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Feb 2015 01:06:03 +0000 (17:06 -0800)
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c

index 0071469..259a4d5 100644 (file)
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = {
  
  static void moom_callback(struct work_struct *ignored)
  {
-       out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL,
-                     0, NULL, true);
+       if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
+                          GFP_KERNEL, 0, NULL, true))
+               pr_info("OOM request ignored because killer is disabled\n");
  }
  
  static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/oom.h b/include/linux/oom.h

index b42b80f..d5771be 100644 (file)
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
                 unsigned long totalpages, const nodemask_t *nodemask,
                 bool force_kill);
  
-extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                 int order, nodemask_t *mask, bool force_kill);
  extern int register_oom_notifier(struct notifier_block *nb);
  extern int unregister_oom_notifier(struct notifier_block *nb);
  
  extern bool oom_killer_disabled;
-
-static inline void oom_killer_disable(void)
-{
-       oom_killer_disabled = true;
-}
-
-static inline void oom_killer_enable(void)
-{
-       oom_killer_disabled = false;
-}
+extern bool oom_killer_disable(void);
+extern void oom_killer_enable(void);
  
  extern struct task_struct *find_lock_task_mm(struct task_struct *p);
  
diff --git a/kernel/exit.c b/kernel/exit.c

index 02b3d1a..feff10b 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk)
         task_unlock(tsk);
         mm_update_next_owner(mm);
         mmput(mm);
-       unmark_oom_victim();
+       if (test_thread_flag(TIF_MEMDIE))
+               unmark_oom_victim();
  }
  
  static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/power/process.c b/kernel/power/process.c

index 3ac45f1..564f786 100644 (file)
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only)
         return todo ? -EBUSY : 0;
  }
  
-static bool __check_frozen_processes(void)
-{
-       struct task_struct *g, *p;
-
-       for_each_process_thread(g, p)
-               if (p != current && !freezer_should_skip(p) && !frozen(p))
-                       return false;
-
-       return true;
-}
-
-/*
- * Returns true if all freezable tasks (except for current) are frozen already
- */
-static bool check_frozen_processes(void)
-{
-       bool ret;
-
-       read_lock(&tasklist_lock);
-       ret = __check_frozen_processes();
-       read_unlock(&tasklist_lock);
-       return ret;
-}
-
  /**
   * freeze_processes - Signal user space processes to enter the refrigerator.
   * The current thread will not be frozen.  The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
  int freeze_processes(void)
  {
         int error;
-       int oom_kills_saved;
  
         error = __usermodehelper_disable(UMH_FREEZING);
         if (error)
@@ -157,29 +132,22 @@ int freeze_processes(void)
         pm_wakeup_clear();
         pr_info("Freezing user space processes ... ");
         pm_freezing = true;
-       oom_kills_saved = oom_kills_count();
         error = try_to_freeze_tasks(true);
         if (!error) {
                 __usermodehelper_set_disable_depth(UMH_DISABLED);
-               oom_killer_disable();
-
-               /*
-                * There might have been an OOM kill while we were
-                * freezing tasks and the killed task might be still
-                * on the way out so we have to double check for race.
-                */
-               if (oom_kills_count() != oom_kills_saved &&
-                   !check_frozen_processes()) {
-                       __usermodehelper_set_disable_depth(UMH_ENABLED);
-                       pr_cont("OOM in progress.");
-                       error = -EBUSY;
-               } else {
-                       pr_cont("done.");
-               }
+               pr_cont("done.");
         }
         pr_cont("\n");
         BUG_ON(in_atomic());
  
+       /*
+        * Now that the whole userspace is frozen we need to disable
+        * the OOM killer to disallow any further interference with
+        * killable tasks.
+        */
+       if (!error && !oom_killer_disable())
+               error = -EBUSY;
+
         if (error)
                 thaw_processes();
         return error;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index fe4d258..fbf64e6 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1930,7 +1930,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
         if (!memcg)
                 return false;
  
-       if (!handle)
+       if (!handle || oom_killer_disabled)
                 goto cleanup;
  
         owait.memcg = memcg;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c

index 3cbd76b..b8df76e 100644 (file)
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
  }
  
  /*
- * Number of OOM killer invocations (including memcg OOM killer).
- * Primarily used by PM freezer to check for potential races with
- * OOM killed frozen task.
+ * Number of OOM victims in flight
   */
-static atomic_t oom_kills = ATOMIC_INIT(0);
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
  
-int oom_kills_count(void)
-{
-       return atomic_read(&oom_kills);
-}
-
-void note_oom_kill(void)
-{
-       atomic_inc(&oom_kills);
-}
+bool oom_killer_disabled __read_mostly;
+static DECLARE_RWSEM(oom_sem);
  
  /**
   * mark_tsk_oom_victim - marks the given taks as OOM victim.
   * @tsk: task to mark
+ *
+ * Has to be called with oom_sem taken for read and never after
+ * oom has been disabled already.
   */
  void mark_tsk_oom_victim(struct task_struct *tsk)
  {
-       set_tsk_thread_flag(tsk, TIF_MEMDIE);
-
+       WARN_ON(oom_killer_disabled);
+       /* OOM killer might race with memcg OOM */
+       if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+               return;
         /*
          * Make sure that the task is woken up from uninterruptible sleep
          * if it is frozen because OOM killer wouldn't be able to free
@@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
          * that TIF_MEMDIE tasks should be ignored.
          */
         __thaw_task(tsk);
+       atomic_inc(&oom_victims);
  }
  
  /**
   * unmark_oom_victim - unmarks the current task as OOM victim.
+ *
+ * Wakes up all waiters in oom_killer_disable()
   */
  void unmark_oom_victim(void)
  {
-       clear_thread_flag(TIF_MEMDIE);
+       if (!test_and_clear_thread_flag(TIF_MEMDIE))
+               return;
+
+       down_read(&oom_sem);
+       /*
+        * There is no need to signal the last oom_victim if there
+        * is nobody who cares.
+        */
+       if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+               wake_up_all(&oom_victims_wait);
+       up_read(&oom_sem);
+}
+
+/**
+ * oom_killer_disable - disable OOM killer
+ *
+ * Forces all page allocations to fail rather than trigger OOM killer.
+ * Will block and wait until all OOM victims are killed.
+ *
+ * The function cannot be called when there are runnable user tasks because
+ * the userspace would see unexpected allocation failures as a result. Any
+ * new usage of this function should be discussed with MM people.
+ *
+ * Returns true if successful and false if the OOM killer cannot be
+ * disabled.
+ */
+bool oom_killer_disable(void)
+{
+       /*
+        * Make sure to not race with an ongoing OOM killer
+        * and that the current is not the victim.
+        */
+       down_write(&oom_sem);
+       if (test_thread_flag(TIF_MEMDIE)) {
+               up_write(&oom_sem);
+               return false;
+       }
+
+       oom_killer_disabled = true;
+       up_write(&oom_sem);
+
+       wait_event(oom_victims_wait, !atomic_read(&oom_victims));
+
+       return true;
+}
+
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
+{
+       down_write(&oom_sem);
+       oom_killer_disabled = false;
+       up_write(&oom_sem);
  }
  
  #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
  }
  
  /**
- * out_of_memory - kill the "best" process when we run out of memory
+ * __out_of_memory - kill the "best" process when we run out of memory
   * @zonelist: zonelist pointer
   * @gfp_mask: memory allocation flags
   * @order: amount of memory being requested as a power of 2
@@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
   * OR try to be smart about which process to kill. Note that we
   * don't have to be perfect here, we just have to be good.
   */
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                 int order, nodemask_t *nodemask, bool force_kill)
  {
         const nodemask_t *mpol_mask;
@@ -718,6 +771,32 @@ out:
                 schedule_timeout_killable(1);
  }
  
+/**
+ * out_of_memory -  tries to invoke OOM killer.
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * Invokes __out_of_memory() and returns true unless the OOM killer has been
+ * disabled by oom_killer_disable(), in which case it returns false.
+ */
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+               int order, nodemask_t *nodemask, bool force_kill)
+{
+       bool ret = false;
+
+       down_read(&oom_sem);
+       if (!oom_killer_disabled) {
+               __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
+               ret = true;
+       }
+       up_read(&oom_sem);
+
+       return ret;
+}
+
  /*
   * The pagefault handler calls here because it is out of memory, so kill a
   * memory-hogging task.  If any populated zone has ZONE_OOM_LOCKED set, a
@@ -727,12 +806,25 @@ void pagefault_out_of_memory(void)
  {
         struct zonelist *zonelist;
  
+       down_read(&oom_sem);
         if (mem_cgroup_oom_synchronize(true))
-               return;
+               goto unlock;
  
         zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
         if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
-               out_of_memory(NULL, 0, 0, NULL, false);
+               if (!oom_killer_disabled)
+                       __out_of_memory(NULL, 0, 0, NULL, false);
+               else
+                       /*
+                        * There shouldn't be any user tasks runnable while the
+                        * OOM killer is disabled so the current task has to
+                        * be a racing OOM victim for which oom_killer_disable()
+                        * is waiting.
+                        */
+                       WARN_ON(test_thread_flag(TIF_MEMDIE));
+
                 oom_zonelist_unlock(zonelist, GFP_KERNEL);
         }
+unlock:
+       up_read(&oom_sem);
  }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 641d5a9..134e255 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
                                         PB_migrate, PB_migrate_end);
  }
  
-bool oom_killer_disabled __read_mostly;
-
  #ifdef CONFIG_DEBUG_VM
  static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
  {
@@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
  
         *did_some_progress = 0;
  
-       if (oom_killer_disabled)
-               return NULL;
-
         /*
          * Acquire the per-zone oom lock for each zone.  If that
          * fails, somebody else is making progress for us.
@@ -2330,14 +2325,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                 return NULL;
         }
  
-       /*
-        * PM-freezer should be notified that there might be an OOM killer on
-        * its way to kill and wake somebody up. This is too early and we might
-        * end up not killing anything but false positives are acceptable.
-        * See freeze_processes.
-        */
-       note_oom_kill();
-
         /*
          * Go through the zonelist yet one more time, keep very high watermark
          * here, this is only to catch a parallel oom killing, we must fail if
@@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                         goto out;
         }
         /* Exhausted what can be done so it's blamo time */
-       out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false);
-       *did_some_progress = 1;
+       if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
+               *did_some_progress = 1;
  out:
         oom_zonelist_unlock(ac->zonelist, gfp_mask);
         return page;
author	Michal Hocko <mhocko@suse.cz>
	Wed, 11 Feb 2015 23:26:24 +0000 (15:26 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 12 Feb 2015 01:06:03 +0000 (17:06 -0800)
drivers/tty/sysrq.c		patch \| blob \| history
include/linux/oom.h		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/power/process.c		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/oom_kill.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history