Merge branch 'for-linus-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/mason...

[cascardo/linux.git] / fs / userfaultfd.c
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c

index af88ef6..634e676 100644 (file)
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -45,6 +45,8 @@ struct userfaultfd_ctx {
         wait_queue_head_t fault_wqh;
         /* waitqueue head for the pseudo fd to wakeup poll/read */
         wait_queue_head_t fd_wqh;
+       /* refile sequence counter, protected by fault_pending_wqh.lock */
+       struct seqcount refile_seq;
         /* pseudo fd refcounting */
         atomic_t refcount;
         /* userfaultfd syscall flags */
@@ -262,7 +264,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
         struct userfaultfd_ctx *ctx;
         struct userfaultfd_wait_queue uwq;
         int ret;
-       bool must_wait;
+       bool must_wait, return_to_userland;
  
         BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
  
@@ -327,6 +329,9 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
         uwq.msg = userfault_msg(address, flags, reason);
         uwq.ctx = ctx;
  
+       return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+               (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+
         spin_lock(&ctx->fault_pending_wqh.lock);
         /*
          * After the __add_wait_queue the uwq is visible to userland
@@ -338,14 +343,16 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
          * following the spin_unlock to happen before the list_add in
          * __add_wait_queue.
          */
-       set_current_state(TASK_KILLABLE);
+       set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
+                         TASK_KILLABLE);
         spin_unlock(&ctx->fault_pending_wqh.lock);
  
         must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
         up_read(&mm->mmap_sem);
  
         if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
-                  !fatal_signal_pending(current))) {
+                  (return_to_userland ? !signal_pending(current) :
+                   !fatal_signal_pending(current)))) {
                 wake_up_poll(&ctx->fd_wqh, POLLIN);
                 schedule();
                 ret |= VM_FAULT_MAJOR;
@@ -353,6 +360,30 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
  
         __set_current_state(TASK_RUNNING);
  
+       if (return_to_userland) {
+               if (signal_pending(current) &&
+                   !fatal_signal_pending(current)) {
+                       /*
+                        * If we got a SIGSTOP or SIGCONT and this is
+                        * a normal userland page fault, just let
+                        * userland return so the signal will be
+                        * handled and gdb debugging works.  The page
+                        * fault code immediately after we return from
+                        * this function is going to release the
+                        * mmap_sem and it's not depending on it
+                        * (unlike gup would if we were not to return
+                        * VM_FAULT_RETRY).
+                        *
+                        * If a fatal signal is pending we still take
+                        * the streamlined VM_FAULT_RETRY failure path
+                        * and there's no need to retake the mmap_sem
+                        * in such case.
+                        */
+                       down_read(&mm->mmap_sem);
+                       ret = 0;
+               }
+       }
+
         /*
          * Here we race with the list_del; list_add in
          * userfaultfd_ctx_read(), however because we don't ever run
@@ -517,6 +548,15 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                 spin_lock(&ctx->fault_pending_wqh.lock);
                 uwq = find_userfault(ctx);
                 if (uwq) {
+                       /*
+                        * Use a seqcount so that wake_userfault()
+                        * repeats its lockless check. This avoids
+                        * missed wakeups: during the refile both
+                        * waitqueues could momentarily look empty
+                        * if this is the only userfault.
+                        */
+                       write_seqcount_begin(&ctx->refile_seq);
+
                         /*
                          * The fault_pending_wqh.lock prevents the uwq
                          * to disappear from under us.
@@ -541,6 +581,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                         list_del(&uwq->wq.task_list);
                         __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
  
+                       write_seqcount_end(&ctx->refile_seq);
+
                         /* careful to always initialize msg if ret == 0 */
                         *msg = uwq->msg;
                         spin_unlock(&ctx->fault_pending_wqh.lock);
@@ -618,6 +660,9 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx,
  static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
                                            struct userfaultfd_wake_range *range)
  {
+       unsigned seq;
+       bool need_wakeup;
+
         /*
          * To be sure waitqueue_active() is not reordered by the CPU
          * before the pagetable update, use an explicit SMP memory
@@ -633,8 +678,13 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
          * userfaults yet. So we take the spinlock only when we're
          * sure we've userfaults to wake.
          */
-       if (waitqueue_active(&ctx->fault_pending_wqh) ||
-           waitqueue_active(&ctx->fault_wqh))
+       do {
+               seq = read_seqcount_begin(&ctx->refile_seq);
+               need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
+                       waitqueue_active(&ctx->fault_wqh);
+               cond_resched();
+       } while (read_seqcount_retry(&ctx->refile_seq, seq));
+       if (need_wakeup)
                 __wake_userfault(ctx, range);
  }
  
@@ -1190,6 +1240,7 @@ static void init_once_userfaultfd_ctx(void *mem)
         init_waitqueue_head(&ctx->fault_pending_wqh);
         init_waitqueue_head(&ctx->fault_wqh);
         init_waitqueue_head(&ctx->fd_wqh);
+       seqcount_init(&ctx->refile_seq);
  }
  
  /**