wait_queue_head_t fault_wqh;
/* waitqueue head for the pseudo fd to wakeup poll/read */
wait_queue_head_t fd_wqh;
+ /* refile sequence counter, writers hold fault_pending_wqh lock */
+ struct seqcount refile_seq;
/* pseudo fd refcounting */
atomic_t refcount;
/* userfaultfd syscall flags */
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
int ret;
- bool must_wait;
+ bool must_wait, return_to_userland;
BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
uwq.msg = userfault_msg(address, flags, reason);
uwq.ctx = ctx;
+ return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+
spin_lock(&ctx->fault_pending_wqh.lock);
/*
* After the __add_wait_queue the uwq is visible to userland
* following the spin_unlock to happen before the list_add in
* __add_wait_queue.
*/
- set_current_state(TASK_KILLABLE);
+ set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
+ TASK_KILLABLE);
spin_unlock(&ctx->fault_pending_wqh.lock);
must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
- !fatal_signal_pending(current))) {
+ (return_to_userland ? !signal_pending(current) :
+ !fatal_signal_pending(current)))) {
wake_up_poll(&ctx->fd_wqh, POLLIN);
schedule();
ret |= VM_FAULT_MAJOR;
__set_current_state(TASK_RUNNING);
+ if (return_to_userland) {
+ if (signal_pending(current) &&
+ !fatal_signal_pending(current)) {
+ /*
+ * If we got a SIGSTOP or SIGCONT and this is
+ * a normal userland page fault, just let
+ * userland return so the signal will be
+ * handled and gdb debugging works. The page
+ * fault code that runs right after we return
+ * from this function releases the mmap_sem
+ * and does not otherwise depend on it (unlike
+ * gup, which would if we did not return
+ * VM_FAULT_RETRY).
+ *
+ * If a fatal signal is pending we still take
+ * the streamlined VM_FAULT_RETRY failure path
+ * and there's no need to retake the mmap_sem
+ * in such case.
+ */
+ down_read(&mm->mmap_sem);
+ ret = 0;
+ }
+ }
+
/*
* Here we race with the list_del; list_add in
* userfaultfd_ctx_read(), however because we don't ever run
spin_lock(&ctx->fault_pending_wqh.lock);
uwq = find_userfault(ctx);
if (uwq) {
+ /*
+ * Use a seqcount to repeat the lockless check
+ * in wake_userfault() to avoid missing
+ * wakeups, because during the refile both
+ * waitqueues could appear empty if this is
+ * the only userfault.
+ */
+ write_seqcount_begin(&ctx->refile_seq);
+
/*
* The fault_pending_wqh.lock prevents the uwq
* to disappear from under us.
list_del(&uwq->wq.task_list);
__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+ write_seqcount_end(&ctx->refile_seq);
+
/* careful to always initialize msg if ret == 0 */
*msg = uwq->msg;
spin_unlock(&ctx->fault_pending_wqh.lock);
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range *range)
{
+ unsigned seq;
+ bool need_wakeup;
+
/*
* To be sure waitqueue_active() is not reordered by the CPU
* before the pagetable update, use an explicit SMP memory
* userfaults yet. So we take the spinlock only when we're
* sure we've userfaults to wake.
*/
- if (waitqueue_active(&ctx->fault_pending_wqh) ||
- waitqueue_active(&ctx->fault_wqh))
+ do {
+ seq = read_seqcount_begin(&ctx->refile_seq);
+ need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
+ waitqueue_active(&ctx->fault_wqh);
+ cond_resched();
+ } while (read_seqcount_retry(&ctx->refile_seq, seq));
+ if (need_wakeup)
__wake_userfault(ctx, range);
}
init_waitqueue_head(&ctx->fault_pending_wqh);
init_waitqueue_head(&ctx->fault_wqh);
init_waitqueue_head(&ctx->fd_wqh);
+ seqcount_init(&ctx->refile_seq);
}
/**