[cascardo/linux.git] / kernel / futex.c
1 /*
2  *  Fast Userspace Mutexes (which I call "Futexes!").
3  *  (C) Rusty Russell, IBM 2002
4  *
5  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
6  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
7  *
8  *  Removed page pinning, fix privately mapped COW pages and other cleanups
9  *  (C) Copyright 2003, 2004 Jamie Lokier
10  *
11  *  Robust futex support started by Ingo Molnar
12  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14  *
15  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
16  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18  *
19  *  PRIVATE futexes by Eric Dumazet
20  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21  *
22  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23  *  Copyright (C) IBM Corporation, 2009
24  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
25  *
26  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
27  *  enough at me, Linus for the original (flawed) idea, Matthew
28  *  Kirkwood for proof-of-concept implementation.
29  *
30  *  "The futexes are also cursed."
31  *  "But they come in a choice of three flavours!"
32  *
33  *  This program is free software; you can redistribute it and/or modify
34  *  it under the terms of the GNU General Public License as published by
35  *  the Free Software Foundation; either version 2 of the License, or
36  *  (at your option) any later version.
37  *
38  *  This program is distributed in the hope that it will be useful,
39  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
40  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
41  *  GNU General Public License for more details.
42  *
43  *  You should have received a copy of the GNU General Public License
44  *  along with this program; if not, write to the Free Software
45  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
46  */
47 #include <linux/slab.h>
48 #include <linux/poll.h>
49 #include <linux/fs.h>
50 #include <linux/file.h>
51 #include <linux/jhash.h>
52 #include <linux/init.h>
53 #include <linux/futex.h>
54 #include <linux/mount.h>
55 #include <linux/pagemap.h>
56 #include <linux/syscalls.h>
57 #include <linux/signal.h>
58 #include <linux/export.h>
59 #include <linux/magic.h>
60 #include <linux/pid.h>
61 #include <linux/nsproxy.h>
62 #include <linux/ptrace.h>
63 #include <linux/sched/rt.h>
64 #include <linux/hugetlb.h>
65 #include <linux/freezer.h>
66 #include <linux/bootmem.h>
67 #include <linux/fault-inject.h>
68
69 #include <asm/futex.h>
70
71 #include "locking/rtmutex_common.h"
72
73 /*
74  * READ this before attempting to hack on futexes!
75  *
76  * Basic futex operation and ordering guarantees
77  * =============================================
78  *
79  * The waiter reads the futex value in user space and calls
80  * futex_wait(). This function computes the hash bucket and acquires
81  * the hash bucket lock. After that it reads the futex user space value
82  * again and verifies that the data has not changed. If it has not changed
83  * it enqueues itself into the hash bucket, releases the hash bucket lock
84  * and schedules.
85  *
86  * The waker side modifies the user space value of the futex and calls
87  * futex_wake(). This function computes the hash bucket and acquires the
88  * hash bucket lock. Then it looks for waiters on that futex in the hash
89  * bucket and wakes them.
90  *
 91  * In futex wakeup scenarios where no tasks are blocked on a futex, taking
 92  * the hb spinlock can be avoided and the syscall can simply return. For this
93  * optimization to work, ordering guarantees must exist so that the waiter
94  * being added to the list is acknowledged when the list is concurrently being
95  * checked by the waker, avoiding scenarios like the following:
96  *
97  * CPU 0                               CPU 1
98  * val = *futex;
99  * sys_futex(WAIT, futex, val);
100  *   futex_wait(futex, val);
101  *   uval = *futex;
102  *                                     *futex = newval;
103  *                                     sys_futex(WAKE, futex);
104  *                                       futex_wake(futex);
105  *                                       if (queue_empty())
106  *                                         return;
107  *   if (uval == val)
108  *      lock(hash_bucket(futex));
109  *      queue();
110  *     unlock(hash_bucket(futex));
111  *     schedule();
112  *
113  * This would cause the waiter on CPU 0 to wait forever because it
114  * missed the transition of the user space value from val to newval
115  * and the waker did not find the waiter in the hash bucket queue.
116  *
117  * The correct serialization ensures that a waiter either observes
118  * the changed user space value before blocking or is woken by a
119  * concurrent waker:
120  *
121  * CPU 0                                 CPU 1
122  * val = *futex;
123  * sys_futex(WAIT, futex, val);
124  *   futex_wait(futex, val);
125  *
126  *   waiters++; (a)
127  *   smp_mb(); (A) <-- paired with -.
128  *                                  |
129  *   lock(hash_bucket(futex));      |
130  *                                  |
131  *   uval = *futex;                 |
132  *                                  |        *futex = newval;
133  *                                  |        sys_futex(WAKE, futex);
134  *                                  |          futex_wake(futex);
135  *                                  |
136  *                                  `--------> smp_mb(); (B)
137  *   if (uval == val)
138  *     queue();
139  *     unlock(hash_bucket(futex));
140  *     schedule();                         if (waiters)
141  *                                           lock(hash_bucket(futex));
142  *   else                                    wake_waiters(futex);
143  *     waiters--; (b)                        unlock(hash_bucket(futex));
144  *
145  * Where (A) orders the waiters increment and the futex value read through
146  * atomic operations (see hb_waiters_inc) and where (B) orders the write
147  * to futex and the waiters read -- this is done by the barriers for both
148  * shared and private futexes in get_futex_key_refs().
149  *
150  * This yields the following case (where X:=waiters, Y:=futex):
151  *
152  *      X = Y = 0
153  *
154  *      w[X]=1          w[Y]=1
155  *      MB              MB
156  *      r[Y]=y          r[X]=x
157  *
158  * Which guarantees that x==0 && y==0 is impossible; which translates back into
159  * the guarantee that we cannot both miss the futex variable change and the
160  * enqueue.
161  *
162  * Note that a new waiter is accounted for in (a) even though the wait call
163  * may later return an error, in which case we backtrack from it in (b).
164  * Refer to the comment in queue_lock().
165  *
166  * Similarly, in order to account for waiters being requeued to another
167  * address we always increment the waiters for the destination bucket before
168  * acquiring the lock and decrement them again after releasing it - the code
169  * that actually moves the futex(es) between hash buckets (requeue_futex)
170  * does the additional required waiter count housekeeping. This is done in
171  * double_lock_hb() and double_unlock_hb(), respectively.
172  */
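
/*
 * Editorial illustration, not part of this file: a minimal user-space sketch
 * of the wait/wake protocol described above, using the raw futex syscall.
 * The helper names are assumptions made for the example only.
 */
#if 0
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static int example_futex_wait(int *uaddr, int expected)
{
	/* The kernel re-reads *uaddr under the hash bucket lock; see (A) above. */
	return syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, expected,
		       NULL, NULL, 0);
}

static int example_futex_wake(int *uaddr, int nr_wake)
{
	/* The waker must store the new value to *uaddr before this call; see (B). */
	return syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, nr_wake,
		       NULL, NULL, 0);
}
#endif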
173
174 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
175 int __read_mostly futex_cmpxchg_enabled;
176 #endif
177
178 /*
179  * Futex flags used to encode options to functions and preserve them across
180  * restarts.
181  */
182 #define FLAGS_SHARED            0x01
183 #define FLAGS_CLOCKRT           0x02
184 #define FLAGS_HAS_TIMEOUT       0x04
185
186 /*
187  * Priority Inheritance state:
188  */
189 struct futex_pi_state {
190         /*
191          * list of 'owned' pi_state instances - these have to be
192          * cleaned up in do_exit() if the task exits prematurely:
193          */
194         struct list_head list;
195
196         /*
197          * The PI object:
198          */
199         struct rt_mutex pi_mutex;
200
201         struct task_struct *owner;
202         atomic_t refcount;
203
204         union futex_key key;
205 };
206
207 /**
208  * struct futex_q - The hashed futex queue entry, one per waiting task
209  * @list:               priority-sorted list of tasks waiting on this futex
210  * @task:               the task waiting on the futex
211  * @lock_ptr:           the hash bucket lock
212  * @key:                the key the futex is hashed on
213  * @pi_state:           optional priority inheritance state
214  * @rt_waiter:          rt_waiter storage for use with requeue_pi
215  * @requeue_pi_key:     the requeue_pi target futex key
216  * @bitset:             bitset for the optional bitmasked wakeup
217  *
218  * We use this hashed waitqueue, instead of a normal wait_queue_t, so
219  * we can wake only the relevant ones (hashed queues may be shared).
220  *
221  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
222  * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
223  * The order of wakeup is always to make the first condition true, then
224  * the second.
225  *
226  * PI futexes are typically woken before they are removed from the hash list via
227  * the rt_mutex code. See unqueue_me_pi().
228  */
229 struct futex_q {
230         struct plist_node list;
231
232         struct task_struct *task;
233         spinlock_t *lock_ptr;
234         union futex_key key;
235         struct futex_pi_state *pi_state;
236         struct rt_mutex_waiter *rt_waiter;
237         union futex_key *requeue_pi_key;
238         u32 bitset;
239 };
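
/*
 * Editorial illustration, not part of this file: one way a waiter could test
 * the "woken" condition described in the comment above without taking the
 * hash bucket lock; the helper name is an assumption for the example only.
 */
#if 0
static inline bool example_futex_q_is_woken(struct futex_q *q)
{
	/* Either condition becoming true means a waker has processed us. */
	return plist_node_empty(&q->list) || READ_ONCE(q->lock_ptr) == NULL;
}
#endif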
240
241 static const struct futex_q futex_q_init = {
242         /* list gets initialized in queue_me() */
243         .key = FUTEX_KEY_INIT,
244         .bitset = FUTEX_BITSET_MATCH_ANY
245 };
246
247 /*
248  * Hash buckets are shared by all the futex_keys that hash to the same
249  * location.  Each key may have multiple futex_q structures, one for each task
250  * waiting on a futex.
251  */
252 struct futex_hash_bucket {
253         atomic_t waiters;
254         spinlock_t lock;
255         struct plist_head chain;
256 } ____cacheline_aligned_in_smp;
257
258 /*
259  * The base of the bucket array and its size are always used together
260  * (after initialization only in hash_futex()), so ensure that they
261  * reside in the same cacheline.
262  */
263 static struct {
264         struct futex_hash_bucket *queues;
265         unsigned long            hashsize;
266 } __futex_data __read_mostly __aligned(2*sizeof(long));
267 #define futex_queues   (__futex_data.queues)
268 #define futex_hashsize (__futex_data.hashsize)
269
270
271 /*
272  * Fault injections for futexes.
273  */
274 #ifdef CONFIG_FAIL_FUTEX
275
276 static struct {
277         struct fault_attr attr;
278
279         bool ignore_private;
280 } fail_futex = {
281         .attr = FAULT_ATTR_INITIALIZER,
282         .ignore_private = false,
283 };
284
285 static int __init setup_fail_futex(char *str)
286 {
287         return setup_fault_attr(&fail_futex.attr, str);
288 }
289 __setup("fail_futex=", setup_fail_futex);
290
291 static bool should_fail_futex(bool fshared)
292 {
293         if (fail_futex.ignore_private && !fshared)
294                 return false;
295
296         return should_fail(&fail_futex.attr, 1);
297 }
298
299 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
300
301 static int __init fail_futex_debugfs(void)
302 {
303         umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
304         struct dentry *dir;
305
306         dir = fault_create_debugfs_attr("fail_futex", NULL,
307                                         &fail_futex.attr);
308         if (IS_ERR(dir))
309                 return PTR_ERR(dir);
310
311         if (!debugfs_create_bool("ignore-private", mode, dir,
312                                  &fail_futex.ignore_private)) {
313                 debugfs_remove_recursive(dir);
314                 return -ENOMEM;
315         }
316
317         return 0;
318 }
319
320 late_initcall(fail_futex_debugfs);
321
322 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
323
324 #else
325 static inline bool should_fail_futex(bool fshared)
326 {
327         return false;
328 }
329 #endif /* CONFIG_FAIL_FUTEX */
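
/*
 * Example usage of the fault injection hooks above (assuming CONFIG_FAIL_FUTEX
 * and the generic fault-injection boot syntax):
 *
 *   fail_futex=<interval>,<probability>,<space>,<times>
 *
 * e.g. booting with "fail_futex=1,10,0,-1" makes roughly 10% of the futex
 * fault-injection points report failure (typically surfacing as -EFAULT).
 * With CONFIG_FAULT_INJECTION_DEBUG_FS, private futexes can be exempted at
 * run time via /sys/kernel/debug/fail_futex/ignore-private.
 */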
330
331 static inline void futex_get_mm(union futex_key *key)
332 {
333         atomic_inc(&key->private.mm->mm_count);
334         /*
335          * Ensure futex_get_mm() implies a full barrier such that
336          * get_futex_key() implies a full barrier. This is relied upon
337          * as smp_mb(); (B), see the ordering comment above.
338          */
339         smp_mb__after_atomic();
340 }
341
342 /*
343  * Reflects a new waiter being added to the waitqueue.
344  */
345 static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
346 {
347 #ifdef CONFIG_SMP
348         atomic_inc(&hb->waiters);
349         /*
350          * Full barrier (A), see the ordering comment above.
351          */
352         smp_mb__after_atomic();
353 #endif
354 }
355
356 /*
357  * Reflects a waiter being removed from the waitqueue by wakeup
358  * paths.
359  */
360 static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
361 {
362 #ifdef CONFIG_SMP
363         atomic_dec(&hb->waiters);
364 #endif
365 }
366
367 static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
368 {
369 #ifdef CONFIG_SMP
370         return atomic_read(&hb->waiters);
371 #else
372         return 1;
373 #endif
374 }
375
376 /*
377  * We hash on the keys returned from get_futex_key (see below).
378  */
379 static struct futex_hash_bucket *hash_futex(union futex_key *key)
380 {
381         u32 hash = jhash2((u32*)&key->both.word,
382                           (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
383                           key->both.offset);
384         return &futex_queues[hash & (futex_hashsize - 1)];
385 }
386
387 /*
388  * Return 1 if two futex_keys are equal, 0 otherwise.
389  */
390 static inline int match_futex(union futex_key *key1, union futex_key *key2)
391 {
392         return (key1 && key2
393                 && key1->both.word == key2->both.word
394                 && key1->both.ptr == key2->both.ptr
395                 && key1->both.offset == key2->both.offset);
396 }
397
398 /*
399  * Take a reference to the resource addressed by a key.
400  * Can be called while holding spinlocks.
401  *
402  */
403 static void get_futex_key_refs(union futex_key *key)
404 {
405         if (!key->both.ptr)
406                 return;
407
408         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
409         case FUT_OFF_INODE:
410                 ihold(key->shared.inode); /* implies smp_mb(); (B) */
411                 break;
412         case FUT_OFF_MMSHARED:
413                 futex_get_mm(key); /* implies smp_mb(); (B) */
414                 break;
415         default:
416                 /*
417                  * Private futexes do not hold a reference on an inode or
418                  * mm, therefore the only purpose of calling get_futex_key_refs()
419                  * is to provide the barrier needed for the lockless waiter check.
420                  */
421                 smp_mb(); /* explicit smp_mb(); (B) */
422         }
423 }
424
425 /*
426  * Drop a reference to the resource addressed by a key.
427  * The hash bucket spinlock must not be held. This is
428  * a no-op for private futexes, see comment in the get
429  * counterpart.
430  */
431 static void drop_futex_key_refs(union futex_key *key)
432 {
433         if (!key->both.ptr) {
434                 /* If we're here then we tried to put a key we failed to get */
435                 WARN_ON_ONCE(1);
436                 return;
437         }
438
439         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
440         case FUT_OFF_INODE:
441                 iput(key->shared.inode);
442                 break;
443         case FUT_OFF_MMSHARED:
444                 mmdrop(key->private.mm);
445                 break;
446         }
447 }
448
449 /**
450  * get_futex_key() - Get parameters which are the keys for a futex
451  * @uaddr:      virtual address of the futex
452  * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
453  * @key:        address where result is stored.
454  * @rw:         mapping needs to be read/write (values: VERIFY_READ,
455  *              VERIFY_WRITE)
456  *
457  * Return: a negative error code or 0
458  *
459  * The key words are stored in *key on success.
460  *
461  * For shared mappings, it's (page->index, file_inode(vma->vm_file),
462  * offset_within_page).  For private mappings, it's (uaddr, current->mm).
463  * We can usually work out the index without swapping in the page.
464  *
465  * lock_page() might sleep, the caller should not hold a spinlock.
466  */
467 static int
468 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
469 {
470         unsigned long address = (unsigned long)uaddr;
471         struct mm_struct *mm = current->mm;
472         struct page *page, *tail;
473         struct address_space *mapping;
474         int err, ro = 0;
475
476         /*
477          * The futex address must be "naturally" aligned.
478          */
479         key->both.offset = address % PAGE_SIZE;
480         if (unlikely((address % sizeof(u32)) != 0))
481                 return -EINVAL;
482         address -= key->both.offset;
483
484         if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
485                 return -EFAULT;
486
487         if (unlikely(should_fail_futex(fshared)))
488                 return -EFAULT;
489
490         /*
491          * PROCESS_PRIVATE futexes are fast.
492          * As the mm cannot disappear under us and the 'key' only needs the
493          * virtual address, we don't even have to find the underlying vma.
494          * Note: We do have to check that 'uaddr' is a valid user address,
495          *       but access_ok() should be faster than find_vma().
496          */
497         if (!fshared) {
498                 key->private.mm = mm;
499                 key->private.address = address;
500                 get_futex_key_refs(key);  /* implies smp_mb(); (B) */
501                 return 0;
502         }
503
504 again:
505         /* Ignore any VERIFY_READ mapping (futex common case) */
506         if (unlikely(should_fail_futex(fshared)))
507                 return -EFAULT;
508
509         err = get_user_pages_fast(address, 1, 1, &page);
510         /*
511          * If write access is not required (eg. FUTEX_WAIT), try
512          * and get read-only access.
513          */
514         if (err == -EFAULT && rw == VERIFY_READ) {
515                 err = get_user_pages_fast(address, 1, 0, &page);
516                 ro = 1;
517         }
518         if (err < 0)
519                 return err;
520         else
521                 err = 0;
522
523         /*
524          * The treatment of mapping from this point on is critical. The page
525          * lock protects many things but in this context the page lock
526          * stabilizes mapping, prevents inode freeing in the shared
527          * file-backed region case and guards against movement to swap cache.
528          *
529          * Strictly speaking the page lock is not needed in all cases being
530          * considered here and the page lock forces unnecessary serialization.
531          * From this point on, mapping will be re-verified if necessary and
532          * the page lock will be acquired only if it is unavoidable.
533          *
534          * Mapping checks require the head page for any compound page so the
535          * head page and mapping are looked up now. For anonymous pages, it
536          * does not matter if the page splits in the future as the key is
537          * based on the address. For filesystem-backed pages, the tail is
538          * required as the index of the page determines the key. For
539          * base pages, there is no tail page and tail == page.
540          */
541         tail = page;
542         page = compound_head(page);
543         mapping = READ_ONCE(page->mapping);
544
545         /*
546          * If page->mapping is NULL, then it cannot be a PageAnon
547          * page; but it might be the ZERO_PAGE or in the gate area or
548          * in a special mapping (all cases which we are happy to fail);
549          * or it may have been a good file page when get_user_pages_fast
550          * found it, but truncated or holepunched or subjected to
551          * invalidate_complete_page2 before we got the page lock (also
552          * cases which we are happy to fail).  And we hold a reference,
553          * so refcount care in invalidate_complete_page's remove_mapping
554          * prevents drop_caches from setting mapping to NULL beneath us.
555          *
556          * The case we do have to guard against is when memory pressure made
557          * shmem_writepage move it from filecache to swapcache beneath us:
558          * an unlikely race, but we do need to retry for page->mapping.
559          */
560         if (unlikely(!mapping)) {
561                 int shmem_swizzled;
562
563                 /*
564                  * Page lock is required to identify which special case above
565                  * applies. If this is really a shmem page then the page lock
566                  * will prevent unexpected transitions.
567                  */
568                 lock_page(page);
569                 shmem_swizzled = PageSwapCache(page) || page->mapping;
570                 unlock_page(page);
571                 put_page(page);
572
573                 if (shmem_swizzled)
574                         goto again;
575
576                 return -EFAULT;
577         }
578
579         /*
580          * Private mappings are handled in a simple way.
581          *
582          * If the futex key is stored on an anonymous page, then the associated
583          * object is the mm which is implicitly pinned by the calling process.
584          *
585          * NOTE: When userspace waits on a MAP_SHARED mapping, even if
586          * it's a read-only handle, it's expected that futexes attach to
587          * the object not the particular process.
588          */
589         if (PageAnon(page)) {
590                 /*
591                  * A RO anonymous page will never change and thus doesn't make
592                  * sense for futex operations.
593                  */
594                 if (unlikely(should_fail_futex(fshared)) || ro) {
595                         err = -EFAULT;
596                         goto out;
597                 }
598
599                 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
600                 key->private.mm = mm;
601                 key->private.address = address;
602
603                 get_futex_key_refs(key); /* implies smp_mb(); (B) */
604
605         } else {
606                 struct inode *inode;
607
608                 /*
609                  * The associated futex object in this case is the inode and
610                  * the page->mapping must be traversed. Ordinarily this should
611                  * be stabilised under page lock but it's not strictly
612                  * necessary in this case as we just want to pin the inode, not
613                  * update the radix tree or anything like that.
614                  *
615                  * The RCU read lock is taken as the inode is finally freed
616                  * under RCU. If the mapping still matches expectations then the
617                  * mapping->host can be safely accessed as being a valid inode.
618                  */
619                 rcu_read_lock();
620
621                 if (READ_ONCE(page->mapping) != mapping) {
622                         rcu_read_unlock();
623                         put_page(page);
624
625                         goto again;
626                 }
627
628                 inode = READ_ONCE(mapping->host);
629                 if (!inode) {
630                         rcu_read_unlock();
631                         put_page(page);
632
633                         goto again;
634                 }
635
636                 /*
637                  * Take a reference unless it is about to be freed. Previously
638                  * this reference was taken by ihold under the page lock
639                  * pinning the inode in place so i_lock was unnecessary. The
640                  * only way for this check to fail is if the inode was
641                  * truncated in parallel so warn for now if this happens.
642                  *
643                  * We are not calling into get_futex_key_refs() in file-backed
644                  * cases, therefore a successful atomic_inc return below will
645                  * guarantee that get_futex_key() will still imply smp_mb(); (B).
646                  */
647                 if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
648                         rcu_read_unlock();
649                         put_page(page);
650
651                         goto again;
652                 }
653
654                 /* Should be impossible but let's be paranoid for now */
655                 if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
656                         err = -EFAULT;
657                         rcu_read_unlock();
658                         iput(inode);
659
660                         goto out;
661                 }
662
663                 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
664                 key->shared.inode = inode;
665                 key->shared.pgoff = basepage_index(tail);
666                 rcu_read_unlock();
667         }
668
669 out:
670         put_page(page);
671         return err;
672 }
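
/*
 * Editorial recap of the key layouts produced by get_futex_key() above:
 *
 *   PROCESS_PRIVATE:         key->private.mm, key->private.address,
 *                            key->both.offset = offset within the page
 *
 *   shared, anonymous page:  as above, plus FUT_OFF_MMSHARED set in
 *                            key->both.offset (a reference is held on the mm)
 *
 *   shared, file-backed:     key->shared.inode, key->shared.pgoff,
 *                            key->both.offset with FUT_OFF_INODE set
 *                            (a reference is held on the inode)
 */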
673
674 static inline void put_futex_key(union futex_key *key)
675 {
676         drop_futex_key_refs(key);
677 }
678
679 /**
680  * fault_in_user_writeable() - Fault in user address and verify RW access
681  * @uaddr:      pointer to faulting user space address
682  *
683  * Slow path to fixup the fault we just took in the atomic write
684  * access to @uaddr.
685  *
686  * We have no generic implementation of a non-destructive write to the
687  * user address. We know that we faulted in the atomic pagefault
688  * disabled section so we might as well avoid the #PF overhead by
689  * calling fixup_user_fault() to fault the page in right away.
690  */
691 static int fault_in_user_writeable(u32 __user *uaddr)
692 {
693         struct mm_struct *mm = current->mm;
694         int ret;
695
696         down_read(&mm->mmap_sem);
697         ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
698                                FAULT_FLAG_WRITE, NULL);
699         up_read(&mm->mmap_sem);
700
701         return ret < 0 ? ret : 0;
702 }
703
704 /**
705  * futex_top_waiter() - Return the highest priority waiter on a futex
706  * @hb:         the hash bucket the futex_q's reside in
707  * @key:        the futex key (to distinguish it from other futex futex_q's)
708  *
709  * Must be called with the hb lock held.
710  */
711 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
712                                         union futex_key *key)
713 {
714         struct futex_q *this;
715
716         plist_for_each_entry(this, &hb->chain, list) {
717                 if (match_futex(&this->key, key))
718                         return this;
719         }
720         return NULL;
721 }
722
723 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
724                                       u32 uval, u32 newval)
725 {
726         int ret;
727
728         pagefault_disable();
729         ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
730         pagefault_enable();
731
732         return ret;
733 }
734
735 static int get_futex_value_locked(u32 *dest, u32 __user *from)
736 {
737         int ret;
738
739         pagefault_disable();
740         ret = __get_user(*dest, from);
741         pagefault_enable();
742
743         return ret ? -EFAULT : 0;
744 }
745
746
747 /*
748  * PI code:
749  */
750 static int refill_pi_state_cache(void)
751 {
752         struct futex_pi_state *pi_state;
753
754         if (likely(current->pi_state_cache))
755                 return 0;
756
757         pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
758
759         if (!pi_state)
760                 return -ENOMEM;
761
762         INIT_LIST_HEAD(&pi_state->list);
763         /* pi_mutex gets initialized later */
764         pi_state->owner = NULL;
765         atomic_set(&pi_state->refcount, 1);
766         pi_state->key = FUTEX_KEY_INIT;
767
768         current->pi_state_cache = pi_state;
769
770         return 0;
771 }
772
773 static struct futex_pi_state * alloc_pi_state(void)
774 {
775         struct futex_pi_state *pi_state = current->pi_state_cache;
776
777         WARN_ON(!pi_state);
778         current->pi_state_cache = NULL;
779
780         return pi_state;
781 }
782
783 /*
784  * Drops a reference to the pi_state object and frees or caches it
785  * when the last reference is gone.
786  *
787  * Must be called with the hb lock held.
788  */
789 static void put_pi_state(struct futex_pi_state *pi_state)
790 {
791         if (!pi_state)
792                 return;
793
794         if (!atomic_dec_and_test(&pi_state->refcount))
795                 return;
796
797         /*
798          * If pi_state->owner is NULL, the owner is most probably dying
799          * and has cleaned up the pi_state already
800          */
801         if (pi_state->owner) {
802                 raw_spin_lock_irq(&pi_state->owner->pi_lock);
803                 list_del_init(&pi_state->list);
804                 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
805
806                 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
807         }
808
809         if (current->pi_state_cache)
810                 kfree(pi_state);
811         else {
812                 /*
813                  * pi_state->list is already empty.
814                  * clear pi_state->owner.
815                  * refcount is at 0 - put it back to 1.
816                  */
817                 pi_state->owner = NULL;
818                 atomic_set(&pi_state->refcount, 1);
819                 current->pi_state_cache = pi_state;
820         }
821 }
822
823 /*
824  * Look up the task based on what TID userspace gave us.
825  * We don't trust it.
826  */
827 static struct task_struct * futex_find_get_task(pid_t pid)
828 {
829         struct task_struct *p;
830
831         rcu_read_lock();
832         p = find_task_by_vpid(pid);
833         if (p)
834                 get_task_struct(p);
835
836         rcu_read_unlock();
837
838         return p;
839 }
840
841 /*
842  * This task is holding PI mutexes at exit time => bad.
843  * Kernel cleans up PI-state, but userspace is likely hosed.
844  * (Robust-futex cleanup is separate and might save the day for userspace.)
845  */
846 void exit_pi_state_list(struct task_struct *curr)
847 {
848         struct list_head *next, *head = &curr->pi_state_list;
849         struct futex_pi_state *pi_state;
850         struct futex_hash_bucket *hb;
851         union futex_key key = FUTEX_KEY_INIT;
852
853         if (!futex_cmpxchg_enabled)
854                 return;
855         /*
856          * We are a ZOMBIE and nobody can enqueue itself on
857          * pi_state_list anymore, but we have to be careful
858          * versus waiters unqueueing themselves:
859          */
860         raw_spin_lock_irq(&curr->pi_lock);
861         while (!list_empty(head)) {
862
863                 next = head->next;
864                 pi_state = list_entry(next, struct futex_pi_state, list);
865                 key = pi_state->key;
866                 hb = hash_futex(&key);
867                 raw_spin_unlock_irq(&curr->pi_lock);
868
869                 spin_lock(&hb->lock);
870
871                 raw_spin_lock_irq(&curr->pi_lock);
872                 /*
873                  * We dropped the pi-lock, so re-check whether this
874                  * task still owns the PI-state:
875                  */
876                 if (head->next != next) {
877                         spin_unlock(&hb->lock);
878                         continue;
879                 }
880
881                 WARN_ON(pi_state->owner != curr);
882                 WARN_ON(list_empty(&pi_state->list));
883                 list_del_init(&pi_state->list);
884                 pi_state->owner = NULL;
885                 raw_spin_unlock_irq(&curr->pi_lock);
886
887                 rt_mutex_unlock(&pi_state->pi_mutex);
888
889                 spin_unlock(&hb->lock);
890
891                 raw_spin_lock_irq(&curr->pi_lock);
892         }
893         raw_spin_unlock_irq(&curr->pi_lock);
894 }
895
896 /*
897  * We need to check the following states:
898  *
899  *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
900  *
901  * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
902  * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
903  *
904  * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
905  *
906  * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
907  * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
908  *
909  * [6]  Found  | Found    | task      | 0         | 1      | Valid
910  *
911  * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
912  *
913  * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
914  * [9]  Found  | Found    | task      | 0         | 0      | Invalid
915  * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
916  *
917  * [1]  Indicates that the kernel can acquire the futex atomically. We
918  *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
919  *
920  * [2]  Valid, if TID does not belong to a kernel thread. If no matching
921  *      thread is found then it indicates that the owner TID has died.
922  *
923  * [3]  Invalid. The waiter is queued on a non-PI futex.
924  *
925  * [4]  Valid state after exit_robust_list(), which sets the user space
926  *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
927  *
928  * [5]  The user space value got manipulated between exit_robust_list()
929  *      and exit_pi_state_list()
930  *
931  * [6]  Valid state after exit_pi_state_list() which sets the new owner in
932  *      the pi_state but cannot access the user space value.
933  *
934  * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
935  *
936  * [8]  Owner and user space value match
937  *
938  * [9]  There is no transient state which sets the user space TID to 0
939  *      except exit_robust_list(), but this is indicated by the
940  *      FUTEX_OWNER_DIED bit. See [4]
941  *
942  * [10] There is no transient state which leaves owner and user space
943  *      TID out of sync.
944  */
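
/*
 * For reference, the user space PI futex word (uval) referred to in the
 * table above is laid out as follows (constants from <linux/futex.h>):
 *
 *   bit 31:    FUTEX_WAITERS    (0x80000000) - kernel has waiters queued
 *   bit 30:    FUTEX_OWNER_DIED (0x40000000) - the "uODIED" column above
 *   bits 0-29: FUTEX_TID_MASK   (0x3fffffff) - owner TID, the "uTID" column
 */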
945
946 /*
947  * Validate that the existing waiter has a pi_state and sanity check
948  * the pi_state against the user space value. If correct, attach to
949  * it.
950  */
951 static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
952                               struct futex_pi_state **ps)
953 {
954         pid_t pid = uval & FUTEX_TID_MASK;
955
956         /*
957          * Userspace might have messed up non-PI and PI futexes [3]
958          */
959         if (unlikely(!pi_state))
960                 return -EINVAL;
961
962         WARN_ON(!atomic_read(&pi_state->refcount));
963
964         /*
965          * Handle the owner died case:
966          */
967         if (uval & FUTEX_OWNER_DIED) {
968                 /*
969                  * exit_pi_state_list sets owner to NULL and wakes the
970                  * topmost waiter. The task which acquires the
971                  * pi_state->rt_mutex will fixup owner.
972                  */
973                 if (!pi_state->owner) {
974                         /*
975                          * No pi state owner, but the user space TID
976                          * is not 0. Inconsistent state. [5]
977                          */
978                         if (pid)
979                                 return -EINVAL;
980                         /*
981                          * Take a ref on the state and return success. [4]
982                          */
983                         goto out_state;
984                 }
985
986                 /*
987                  * If TID is 0, then either the dying owner has not
988                  * yet executed exit_pi_state_list() or some waiter
989                  * acquired the rtmutex in the pi state, but did not
990                  * yet fixup the TID in user space.
991                  *
992                  * Take a ref on the state and return success. [6]
993                  */
994                 if (!pid)
995                         goto out_state;
996         } else {
997                 /*
998                  * If the owner died bit is not set, then the pi_state
999                  * must have an owner. [7]
1000                  */
1001                 if (!pi_state->owner)
1002                         return -EINVAL;
1003         }
1004
1005         /*
1006          * Bail out if user space manipulated the futex value. If pi
1007          * state exists then the owner TID must be the same as the
1008          * user space TID. [9/10]
1009          */
1010         if (pid != task_pid_vnr(pi_state->owner))
1011                 return -EINVAL;
1012 out_state:
1013         atomic_inc(&pi_state->refcount);
1014         *ps = pi_state;
1015         return 0;
1016 }
1017
1018 /*
1019  * Lookup the task for the TID provided from user space and attach to
1020  * it after doing proper sanity checks.
1021  */
1022 static int attach_to_pi_owner(u32 uval, union futex_key *key,
1023                               struct futex_pi_state **ps)
1024 {
1025         pid_t pid = uval & FUTEX_TID_MASK;
1026         struct futex_pi_state *pi_state;
1027         struct task_struct *p;
1028
1029         /*
1030          * We are the first waiter - try to look up the real owner and attach
1031          * the new pi_state to it, but bail out when TID = 0 [1]
1032          */
1033         if (!pid)
1034                 return -ESRCH;
1035         p = futex_find_get_task(pid);
1036         if (!p)
1037                 return -ESRCH;
1038
1039         if (unlikely(p->flags & PF_KTHREAD)) {
1040                 put_task_struct(p);
1041                 return -EPERM;
1042         }
1043
1044         /*
1045          * We need to look at the task state flags to figure out
1046          * whether the task is exiting. To protect against the task
1047          * flags changing in do_exit(), we do this check while
1048          * holding p->pi_lock:
1049          */
1050         raw_spin_lock_irq(&p->pi_lock);
1051         if (unlikely(p->flags & PF_EXITING)) {
1052                 /*
1053                  * The task is on the way out. When PF_EXITPIDONE is
1054                  * set, we know that the task has finished the
1055                  * cleanup:
1056                  */
1057                 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
1058
1059                 raw_spin_unlock_irq(&p->pi_lock);
1060                 put_task_struct(p);
1061                 return ret;
1062         }
1063
1064         /*
1065          * No existing pi state. First waiter. [2]
1066          */
1067         pi_state = alloc_pi_state();
1068
1069         /*
1070          * Initialize the pi_mutex in locked state and make @p
1071          * the owner of it:
1072          */
1073         rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
1074
1075         /* Store the key for possible exit cleanups: */
1076         pi_state->key = *key;
1077
1078         WARN_ON(!list_empty(&pi_state->list));
1079         list_add(&pi_state->list, &p->pi_state_list);
1080         pi_state->owner = p;
1081         raw_spin_unlock_irq(&p->pi_lock);
1082
1083         put_task_struct(p);
1084
1085         *ps = pi_state;
1086
1087         return 0;
1088 }
1089
1090 static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
1091                            union futex_key *key, struct futex_pi_state **ps)
1092 {
1093         struct futex_q *match = futex_top_waiter(hb, key);
1094
1095         /*
1096          * If there is a waiter on that futex, validate it and
1097          * attach to the pi_state when the validation succeeds.
1098          */
1099         if (match)
1100                 return attach_to_pi_state(uval, match->pi_state, ps);
1101
1102         /*
1103          * We are the first waiter - try to look up the owner based on
1104          * @uval and attach to it.
1105          */
1106         return attach_to_pi_owner(uval, key, ps);
1107 }
1108
1109 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1110 {
1111         u32 uninitialized_var(curval);
1112
1113         if (unlikely(should_fail_futex(true)))
1114                 return -EFAULT;
1115
1116         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
1117                 return -EFAULT;
1118
1119         /* If user space value changed, let the caller retry */
1120         return curval != uval ? -EAGAIN : 0;
1121 }
1122
1123 /**
1124  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
1125  * @uaddr:              the pi futex user address
1126  * @hb:                 the pi futex hash bucket
1127  * @key:                the futex key associated with uaddr and hb
1128  * @ps:                 the pi_state pointer where we store the result of the
1129  *                      lookup
1130  * @task:               the task to perform the atomic lock work for.  This will
1131  *                      be "current" except in the case of requeue pi.
1132  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1133  *
1134  * Return:
1135  *  0 - ready to wait;
1136  *  1 - acquired the lock;
1137  * <0 - error
1138  *
1139  * The hb->lock and futex_key refs shall be held by the caller.
1140  */
1141 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1142                                 union futex_key *key,
1143                                 struct futex_pi_state **ps,
1144                                 struct task_struct *task, int set_waiters)
1145 {
1146         u32 uval, newval, vpid = task_pid_vnr(task);
1147         struct futex_q *match;
1148         int ret;
1149
1150         /*
1151          * Read the user space value first so we can validate a few
1152          * things before proceeding further.
1153          */
1154         if (get_futex_value_locked(&uval, uaddr))
1155                 return -EFAULT;
1156
1157         if (unlikely(should_fail_futex(true)))
1158                 return -EFAULT;
1159
1160         /*
1161          * Detect deadlocks.
1162          */
1163         if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
1164                 return -EDEADLK;
1165
1166         if ((unlikely(should_fail_futex(true))))
1167                 return -EDEADLK;
1168
1169         /*
1170          * Lookup existing state first. If it exists, try to attach to
1171          * its pi_state.
1172          */
1173         match = futex_top_waiter(hb, key);
1174         if (match)
1175                 return attach_to_pi_state(uval, match->pi_state, ps);
1176
1177         /*
1178          * No waiter and user TID is 0. We are here because the
1179          * waiters bit or the owner died bit is set, we were called from
1180          * requeue_cmp_pi, or for whatever other reason something took the
1181          * syscall.
1182          */
1183         if (!(uval & FUTEX_TID_MASK)) {
1184                 /*
1185                  * We take over the futex. No other waiters and the user space
1186                  * TID is 0. We preserve the owner died bit.
1187                  */
1188                 newval = uval & FUTEX_OWNER_DIED;
1189                 newval |= vpid;
1190
1191                 /* The futex requeue_pi code can enforce the waiters bit */
1192                 if (set_waiters)
1193                         newval |= FUTEX_WAITERS;
1194
1195                 ret = lock_pi_update_atomic(uaddr, uval, newval);
1196                 /* If the take over worked, return 1 */
1197                 return ret < 0 ? ret : 1;
1198         }
1199
1200         /*
1201          * First waiter. Set the waiters bit before attaching ourselves to
1202          * the owner. If owner tries to unlock, it will be forced into
1203          * the kernel and blocked on hb->lock.
1204          */
1205         newval = uval | FUTEX_WAITERS;
1206         ret = lock_pi_update_atomic(uaddr, uval, newval);
1207         if (ret)
1208                 return ret;
1209         /*
1210          * If the update of the user space value succeeded, we try to
1211          * attach to the owner. If that fails, no harm done, we only
1212          * set the FUTEX_WAITERS bit in the user space variable.
1213          */
1214         return attach_to_pi_owner(uval, key, ps);
1215 }
1216
1217 /**
1218  * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
1219  * @q:  The futex_q to unqueue
1220  *
1221  * The q->lock_ptr must not be NULL and must be held by the caller.
1222  */
1223 static void __unqueue_futex(struct futex_q *q)
1224 {
1225         struct futex_hash_bucket *hb;
1226
1227         if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
1228             || WARN_ON(plist_node_empty(&q->list)))
1229                 return;
1230
1231         hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
1232         plist_del(&q->list, &hb->chain);
1233         hb_waiters_dec(hb);
1234 }
1235
1236 /*
1237  * The hash bucket lock must be held when this is called.
1238  * Afterwards, the futex_q must not be accessed. Callers
1239  * must ensure to later call wake_up_q() for the actual
1240  * wakeups to occur.
1241  */
1242 static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1243 {
1244         struct task_struct *p = q->task;
1245
1246         if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
1247                 return;
1248
1249         /*
1250          * Queue the task for later wakeup for after we've released
1251          * the hb->lock. wake_q_add() grabs reference to p.
1252          */
1253         wake_q_add(wake_q, p);
1254         __unqueue_futex(q);
1255         /*
1256          * The waiting task can free the futex_q as soon as
1257          * q->lock_ptr = NULL is written, without taking any locks. A
1258          * memory barrier is required here to prevent the following
1259          * store to lock_ptr from getting ahead of the plist_del.
1260          */
1261         smp_wmb();
1262         q->lock_ptr = NULL;
1263 }
1264
1265 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
1266                          struct futex_hash_bucket *hb)
1267 {
1268         struct task_struct *new_owner;
1269         struct futex_pi_state *pi_state = this->pi_state;
1270         u32 uninitialized_var(curval), newval;
1271         WAKE_Q(wake_q);
1272         bool deboost;
1273         int ret = 0;
1274
1275         if (!pi_state)
1276                 return -EINVAL;
1277
1278         /*
1279          * If current does not own the pi_state then the futex is
1280          * inconsistent and user space fiddled with the futex value.
1281          */
1282         if (pi_state->owner != current)
1283                 return -EINVAL;
1284
1285         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1286         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1287
1288         /*
1289          * It is possible that the next waiter (the one that brought
1290          * this owner to the kernel) timed out and is no longer
1291          * waiting on the lock.
1292          */
1293         if (!new_owner)
1294                 new_owner = this->task;
1295
1296         /*
1297          * We pass it to the next owner. The WAITERS bit is always
1298          * kept enabled while there is PI state around. We clean up the
1299          * owner died bit, because we are the owner.
1300          */
1301         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1302
1303         if (unlikely(should_fail_futex(true)))
1304                 ret = -EFAULT;
1305
1306         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
1307                 ret = -EFAULT;
1308         } else if (curval != uval) {
1309                 /*
1310                  * If an unconditional UNLOCK_PI operation (user space did not
1311                  * try the TID->0 transition) raced with a waiter setting the
1312                  * FUTEX_WAITERS flag between get_user() and locking the hash
1313                  * bucket lock, retry the operation.
1314                  */
1315                 if ((FUTEX_TID_MASK & curval) == uval)
1316                         ret = -EAGAIN;
1317                 else
1318                         ret = -EINVAL;
1319         }
1320         if (ret) {
1321                 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1322                 return ret;
1323         }
1324
1325         raw_spin_lock(&pi_state->owner->pi_lock);
1326         WARN_ON(list_empty(&pi_state->list));
1327         list_del_init(&pi_state->list);
1328         raw_spin_unlock(&pi_state->owner->pi_lock);
1329
1330         raw_spin_lock(&new_owner->pi_lock);
1331         WARN_ON(!list_empty(&pi_state->list));
1332         list_add(&pi_state->list, &new_owner->pi_state_list);
1333         pi_state->owner = new_owner;
1334         raw_spin_unlock(&new_owner->pi_lock);
1335
1336         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1337
1338         deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
1339
1340         /*
1341          * First unlock HB so the waiter does not spin on it once it is woken
1342          * up. Second, wake up the waiter before the priority is adjusted. If we
1343          * deboost first (and lose our higher priority), then the task might get
1344          * scheduled away before the wake up can take place.
1345          */
1346         spin_unlock(&hb->lock);
1347         wake_up_q(&wake_q);
1348         if (deboost)
1349                 rt_mutex_adjust_prio(current);
1350
1351         return 0;
1352 }
1353
1354 /*
1355  * Express the locking dependencies for lockdep:
1356  */
1357 static inline void
1358 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1359 {
1360         if (hb1 <= hb2) {
1361                 spin_lock(&hb1->lock);
1362                 if (hb1 < hb2)
1363                         spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1364         } else { /* hb1 > hb2 */
1365                 spin_lock(&hb2->lock);
1366                 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1367         }
1368 }
1369
1370 static inline void
1371 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1372 {
1373         spin_unlock(&hb1->lock);
1374         if (hb1 != hb2)
1375                 spin_unlock(&hb2->lock);
1376 }
1377
1378 /*
1379  * Wake up waiters matching bitset queued on this futex (uaddr).
1380  */
1381 static int
1382 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1383 {
1384         struct futex_hash_bucket *hb;
1385         struct futex_q *this, *next;
1386         union futex_key key = FUTEX_KEY_INIT;
1387         int ret;
1388         WAKE_Q(wake_q);
1389
1390         if (!bitset)
1391                 return -EINVAL;
1392
1393         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
1394         if (unlikely(ret != 0))
1395                 goto out;
1396
1397         hb = hash_futex(&key);
1398
1399         /* Make sure we really have tasks to wake up */
1400         if (!hb_waiters_pending(hb))
1401                 goto out_put_key;
1402
1403         spin_lock(&hb->lock);
1404
1405         plist_for_each_entry_safe(this, next, &hb->chain, list) {
1406                 if (match_futex (&this->key, &key)) {
1407                         if (this->pi_state || this->rt_waiter) {
1408                                 ret = -EINVAL;
1409                                 break;
1410                         }
1411
1412                         /* Check if one of the bits is set in both bitsets */
1413                         if (!(this->bitset & bitset))
1414                                 continue;
1415
1416                         mark_wake_futex(&wake_q, this);
1417                         if (++ret >= nr_wake)
1418                                 break;
1419                 }
1420         }
1421
1422         spin_unlock(&hb->lock);
1423         wake_up_q(&wake_q);
1424 out_put_key:
1425         put_futex_key(&key);
1426 out:
1427         return ret;
1428 }
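
/*
 * Editorial illustration, not part of this file: a user-space sketch of the
 * bitset-based selective wakeup handled above. The wrapper names are
 * assumptions made for the example only; the bitset must be non-zero.
 */
#if 0
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Block if *uaddr == expected, tagging this waiter with 'bits'. */
static int example_wait_bitset(int *uaddr, int expected, unsigned int bits)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAIT_BITSET_PRIVATE, expected,
		       NULL, NULL, bits);
}

/* Wake up to nr_wake waiters whose registered bitset intersects 'bits'. */
static int example_wake_bitset(int *uaddr, int nr_wake, unsigned int bits)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAKE_BITSET_PRIVATE, nr_wake,
		       NULL, NULL, bits);
}
#endif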
1429
1430 /*
1431  * Wake up all waiters hashed on the physical page that is mapped
1432  * to this virtual address:
1433  */
1434 static int
1435 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1436               int nr_wake, int nr_wake2, int op)
1437 {
1438         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1439         struct futex_hash_bucket *hb1, *hb2;
1440         struct futex_q *this, *next;
1441         int ret, op_ret;
1442         WAKE_Q(wake_q);
1443
1444 retry:
1445         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1446         if (unlikely(ret != 0))
1447                 goto out;
1448         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1449         if (unlikely(ret != 0))
1450                 goto out_put_key1;
1451
1452         hb1 = hash_futex(&key1);
1453         hb2 = hash_futex(&key2);
1454
1455 retry_private:
1456         double_lock_hb(hb1, hb2);
1457         op_ret = futex_atomic_op_inuser(op, uaddr2);
1458         if (unlikely(op_ret < 0)) {
1459
1460                 double_unlock_hb(hb1, hb2);
1461
1462 #ifndef CONFIG_MMU
1463                 /*
1464                  * we don't get EFAULT from MMU faults if we don't have an MMU,
1465                  * but we might get them from range checking
1466                  */
1467                 ret = op_ret;
1468                 goto out_put_keys;
1469 #endif
1470
1471                 if (unlikely(op_ret != -EFAULT)) {
1472                         ret = op_ret;
1473                         goto out_put_keys;
1474                 }
1475
1476                 ret = fault_in_user_writeable(uaddr2);
1477                 if (ret)
1478                         goto out_put_keys;
1479
1480                 if (!(flags & FLAGS_SHARED))
1481                         goto retry_private;
1482
1483                 put_futex_key(&key2);
1484                 put_futex_key(&key1);
1485                 goto retry;
1486         }
1487
1488         plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1489                 if (match_futex (&this->key, &key1)) {
1490                         if (this->pi_state || this->rt_waiter) {
1491                                 ret = -EINVAL;
1492                                 goto out_unlock;
1493                         }
1494                         mark_wake_futex(&wake_q, this);
1495                         if (++ret >= nr_wake)
1496                                 break;
1497                 }
1498         }
1499
1500         if (op_ret > 0) {
1501                 op_ret = 0;
1502                 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1503                         if (match_futex (&this->key, &key2)) {
1504                                 if (this->pi_state || this->rt_waiter) {
1505                                         ret = -EINVAL;
1506                                         goto out_unlock;
1507                                 }
1508                                 mark_wake_futex(&wake_q, this);
1509                                 if (++op_ret >= nr_wake2)
1510                                         break;
1511                         }
1512                 }
1513                 ret += op_ret;
1514         }
1515
1516 out_unlock:
1517         double_unlock_hb(hb1, hb2);
1518         wake_up_q(&wake_q);
1519 out_put_keys:
1520         put_futex_key(&key2);
1521 out_put_key1:
1522         put_futex_key(&key1);
1523 out:
1524         return ret;
1525 }
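
/*
 * For illustration only (not compiled as part of this file): a minimal sketch
 * of how userspace typically encodes the @op argument for FUTEX_WAKE_OP with
 * the FUTEX_OP() helper from the uapi <linux/futex.h>.  The variable names
 * are hypothetical; only the FUTEX_* constants and the syscall argument
 * order are real ABI.
 *
 *	// Atomically do "*uaddr2 ^= 1"; if the old value was non-zero,
 *	// also wake up to nr_wake2 waiters queued on uaddr2.
 *	int op = FUTEX_OP(FUTEX_OP_XOR, 1, FUTEX_OP_CMP_NE, 0);
 *
 *	syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, nr_wake, nr_wake2,
 *		uaddr2, op);
 *
 * Note that nr_wake2 travels in the timeout slot of the syscall; the kernel
 * reinterprets it as an integer for this operation.
 */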
1526
1527 /**
1528  * requeue_futex() - Requeue a futex_q from one hb to another
1529  * @q:          the futex_q to requeue
1530  * @hb1:        the source hash_bucket
1531  * @hb2:        the target hash_bucket
1532  * @key2:       the new key for the requeued futex_q
1533  */
1534 static inline
1535 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1536                    struct futex_hash_bucket *hb2, union futex_key *key2)
1537 {
1538
1539         /*
1540          * If key1 and key2 hash to the same bucket, no need to
1541          * requeue.
1542          */
1543         if (likely(&hb1->chain != &hb2->chain)) {
1544                 plist_del(&q->list, &hb1->chain);
1545                 hb_waiters_dec(hb1);
1546                 hb_waiters_inc(hb2);
1547                 plist_add(&q->list, &hb2->chain);
1548                 q->lock_ptr = &hb2->lock;
1549         }
1550         get_futex_key_refs(key2);
1551         q->key = *key2;
1552 }
1553
1554 /**
1555  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1556  * @q:          the futex_q
1557  * @key:        the key of the requeue target futex
1558  * @hb:         the hash_bucket of the requeue target futex
1559  *
1560  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1561  * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1562  * to the requeue target futex so the waiter can detect the wakeup on the right
1563  * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1564  * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1565  * to protect access to the pi_state to fixup the owner later.  Must be called
1566  * with both q->lock_ptr and hb->lock held.
1567  */
1568 static inline
1569 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1570                            struct futex_hash_bucket *hb)
1571 {
1572         get_futex_key_refs(key);
1573         q->key = *key;
1574
1575         __unqueue_futex(q);
1576
1577         WARN_ON(!q->rt_waiter);
1578         q->rt_waiter = NULL;
1579
1580         q->lock_ptr = &hb->lock;
1581
1582         wake_up_state(q->task, TASK_NORMAL);
1583 }
1584
1585 /**
1586  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1587  * @pifutex:            the user address of the to futex
1588  * @hb1:                the from futex hash bucket, must be locked by the caller
1589  * @hb2:                the to futex hash bucket, must be locked by the caller
1590  * @key1:               the from futex key
1591  * @key2:               the to futex key
1592  * @ps:                 address to store the pi_state pointer
1593  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1594  *
1595  * Try to get the lock on behalf of the top waiter if we can do it atomically.
1596  * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1597  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1598  * hb1 and hb2 must be held by the caller.
1599  *
1600  * Return:
1601  *  0 - failed to acquire the lock atomically;
1602  * >0 - acquired the lock, return value is vpid of the top_waiter
1603  * <0 - error
1604  */
1605 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1606                                  struct futex_hash_bucket *hb1,
1607                                  struct futex_hash_bucket *hb2,
1608                                  union futex_key *key1, union futex_key *key2,
1609                                  struct futex_pi_state **ps, int set_waiters)
1610 {
1611         struct futex_q *top_waiter = NULL;
1612         u32 curval;
1613         int ret, vpid;
1614
1615         if (get_futex_value_locked(&curval, pifutex))
1616                 return -EFAULT;
1617
1618         if (unlikely(should_fail_futex(true)))
1619                 return -EFAULT;
1620
1621         /*
1622          * Find the top_waiter and determine if there are additional waiters.
1623          * If the caller intends to requeue more than 1 waiter to pifutex,
1624          * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1625          * as we have means to handle the possible fault.  If not, don't set
1626          * the bit unnecessarily as it will force the subsequent unlock to enter
1627          * the kernel.
1628          */
1629         top_waiter = futex_top_waiter(hb1, key1);
1630
1631         /* There are no waiters, nothing for us to do. */
1632         if (!top_waiter)
1633                 return 0;
1634
1635         /* Ensure we requeue to the expected futex. */
1636         if (!match_futex(top_waiter->requeue_pi_key, key2))
1637                 return -EINVAL;
1638
1639         /*
1640          * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1641          * the contended case or if set_waiters is 1.  The pi_state is returned
1642          * in ps in contended cases.
1643          */
1644         vpid = task_pid_vnr(top_waiter->task);
1645         ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1646                                    set_waiters);
1647         if (ret == 1) {
1648                 requeue_pi_wake_futex(top_waiter, key2, hb2);
1649                 return vpid;
1650         }
1651         return ret;
1652 }
1653
1654 /**
1655  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1656  * @uaddr1:     source futex user address
1657  * @flags:      futex flags (FLAGS_SHARED, etc.)
1658  * @uaddr2:     target futex user address
1659  * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1660  * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1661  * @cmpval:     @uaddr1 expected value (or %NULL)
1662  * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1663  *              pi futex (pi to pi requeue is not supported)
1664  *
1665  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1666  * uaddr2 atomically on behalf of the top waiter.
1667  *
1668  * Return:
1669  * >=0 - on success, the number of tasks requeued or woken;
1670  *  <0 - on error
1671  */
1672 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1673                          u32 __user *uaddr2, int nr_wake, int nr_requeue,
1674                          u32 *cmpval, int requeue_pi)
1675 {
1676         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1677         int drop_count = 0, task_count = 0, ret;
1678         struct futex_pi_state *pi_state = NULL;
1679         struct futex_hash_bucket *hb1, *hb2;
1680         struct futex_q *this, *next;
1681         WAKE_Q(wake_q);
1682
1683         if (requeue_pi) {
1684                 /*
1685                  * Requeue PI only works on two distinct uaddrs. This
1686                  * check is only valid for private futexes. See below.
1687                  */
1688                 if (uaddr1 == uaddr2)
1689                         return -EINVAL;
1690
1691                 /*
1692                  * requeue_pi requires a pi_state, try to allocate it now
1693                  * without any locks in case it fails.
1694                  */
1695                 if (refill_pi_state_cache())
1696                         return -ENOMEM;
1697                 /*
1698                  * requeue_pi must wake as many tasks as it can, up to nr_wake
1699                  * + nr_requeue, since it acquires the rt_mutex prior to
1700                  * returning to userspace, so as to not leave the rt_mutex with
1701                  * waiters and no owner.  However, second and third wake-ups
1702                  * cannot be predicted as they involve race conditions with the
1703                  * first wake and a fault while looking up the pi_state.  Both
1704                  * pthread_cond_signal() and pthread_cond_broadcast() should
1705                  * use nr_wake=1.
1706                  */
1707                 if (nr_wake != 1)
1708                         return -EINVAL;
1709         }
1710
1711 retry:
1712         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1713         if (unlikely(ret != 0))
1714                 goto out;
1715         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1716                             requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1717         if (unlikely(ret != 0))
1718                 goto out_put_key1;
1719
1720         /*
1721          * The check above which compares uaddrs is not sufficient for
1722          * shared futexes. We need to compare the keys:
1723          */
1724         if (requeue_pi && match_futex(&key1, &key2)) {
1725                 ret = -EINVAL;
1726                 goto out_put_keys;
1727         }
1728
1729         hb1 = hash_futex(&key1);
1730         hb2 = hash_futex(&key2);
1731
1732 retry_private:
1733         hb_waiters_inc(hb2);
1734         double_lock_hb(hb1, hb2);
1735
1736         if (likely(cmpval != NULL)) {
1737                 u32 curval;
1738
1739                 ret = get_futex_value_locked(&curval, uaddr1);
1740
1741                 if (unlikely(ret)) {
1742                         double_unlock_hb(hb1, hb2);
1743                         hb_waiters_dec(hb2);
1744
1745                         ret = get_user(curval, uaddr1);
1746                         if (ret)
1747                                 goto out_put_keys;
1748
1749                         if (!(flags & FLAGS_SHARED))
1750                                 goto retry_private;
1751
1752                         put_futex_key(&key2);
1753                         put_futex_key(&key1);
1754                         goto retry;
1755                 }
1756                 if (curval != *cmpval) {
1757                         ret = -EAGAIN;
1758                         goto out_unlock;
1759                 }
1760         }
1761
1762         if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1763                 /*
1764                  * Attempt to acquire uaddr2 and wake the top waiter. If we
1765                  * intend to requeue waiters, force setting the FUTEX_WAITERS
1766                  * bit.  We force this here where we are able to easily handle
1767                  * faults rather than in the requeue loop below.
1768                  */
1769                 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1770                                                  &key2, &pi_state, nr_requeue);
1771
1772                 /*
1773                  * At this point the top_waiter has either taken uaddr2 or is
1774                  * waiting on it.  If the former, then the pi_state will not
1775                  * exist yet, look it up one more time to ensure we have a
1776                  * reference to it. If the lock was taken, ret contains the
1777                  * vpid of the top waiter task.
1778                  * If the lock was not taken, we have pi_state and an initial
1779                  * refcount on it. In case of an error we have nothing.
1780                  */
1781                 if (ret > 0) {
1782                         WARN_ON(pi_state);
1783                         drop_count++;
1784                         task_count++;
1785                         /*
1786                          * If we acquired the lock, then the user space value
1787                          * of uaddr2 should be vpid. It cannot be changed by
1788                          * the top waiter as it is blocked on hb2 lock if it
1789                          * tries to do so. If something fiddled with it behind
1790                          * our back the pi state lookup might unearth it. So
1791                          * we use the known value rather than rereading it and
1792                          * handing potential crap to lookup_pi_state.
1793                          *
1794                          * If that call succeeds then we have pi_state and an
1795                          * initial refcount on it.
1796                          */
1797                         ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
1798                 }
1799
1800                 switch (ret) {
1801                 case 0:
1802                         /* We hold a reference on the pi state. */
1803                         break;
1804
1805                         /* If the above failed, then pi_state is NULL */
1806                 case -EFAULT:
1807                         double_unlock_hb(hb1, hb2);
1808                         hb_waiters_dec(hb2);
1809                         put_futex_key(&key2);
1810                         put_futex_key(&key1);
1811                         ret = fault_in_user_writeable(uaddr2);
1812                         if (!ret)
1813                                 goto retry;
1814                         goto out;
1815                 case -EAGAIN:
1816                         /*
1817                          * Two reasons for this:
1818                          * - Owner is exiting and we just wait for the
1819                          *   exit to complete.
1820                          * - The user space value changed.
1821                          */
1822                         double_unlock_hb(hb1, hb2);
1823                         hb_waiters_dec(hb2);
1824                         put_futex_key(&key2);
1825                         put_futex_key(&key1);
1826                         cond_resched();
1827                         goto retry;
1828                 default:
1829                         goto out_unlock;
1830                 }
1831         }
1832
1833         plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1834                 if (task_count - nr_wake >= nr_requeue)
1835                         break;
1836
1837                 if (!match_futex(&this->key, &key1))
1838                         continue;
1839
1840                 /*
1841                  * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1842                  * be paired with each other and no other futex ops.
1843                  *
1844                  * We should never be requeueing a futex_q with a pi_state,
1845                  * which is awaiting a futex_unlock_pi().
1846                  */
1847                 if ((requeue_pi && !this->rt_waiter) ||
1848                     (!requeue_pi && this->rt_waiter) ||
1849                     this->pi_state) {
1850                         ret = -EINVAL;
1851                         break;
1852                 }
1853
1854                 /*
1855                  * Wake nr_wake waiters.  For requeue_pi, if we acquired the
1856                  * lock, we already woke the top_waiter.  If not, it will be
1857                  * woken by futex_unlock_pi().
1858                  */
1859                 if (++task_count <= nr_wake && !requeue_pi) {
1860                         mark_wake_futex(&wake_q, this);
1861                         continue;
1862                 }
1863
1864                 /* Ensure we requeue to the expected futex for requeue_pi. */
1865                 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1866                         ret = -EINVAL;
1867                         break;
1868                 }
1869
1870                 /*
1871                  * Requeue nr_requeue waiters and possibly one more in the case
1872                  * of requeue_pi if we couldn't acquire the lock atomically.
1873                  */
1874                 if (requeue_pi) {
1875                         /*
1876                          * Prepare the waiter to take the rt_mutex. Take a
1877                          * refcount on the pi_state and store the pointer in
1878                          * the futex_q object of the waiter.
1879                          */
1880                         atomic_inc(&pi_state->refcount);
1881                         this->pi_state = pi_state;
1882                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1883                                                         this->rt_waiter,
1884                                                         this->task);
1885                         if (ret == 1) {
1886                                 /*
1887                                  * We got the lock. We do neither drop the
1888                                  * refcount on pi_state nor clear
1889                                  * this->pi_state because the waiter needs the
1890                                  * pi_state for cleaning up the user space
1891                                  * value. It will drop the refcount after
1892                                  * doing so.
1893                                  */
1894                                 requeue_pi_wake_futex(this, &key2, hb2);
1895                                 drop_count++;
1896                                 continue;
1897                         } else if (ret) {
1898                                 /*
1899                                  * rt_mutex_start_proxy_lock() detected a
1900                                  * potential deadlock when we tried to queue
1901                                  * that waiter. Drop the pi_state reference
1902                                  * which we took above and remove the pointer
1903                          * to the state from the waiter's futex_q
1904                                  * object.
1905                                  */
1906                                 this->pi_state = NULL;
1907                                 put_pi_state(pi_state);
1908                                 /*
1909                                  * We stop queueing more waiters and let user
1910                                  * space deal with the mess.
1911                                  */
1912                                 break;
1913                         }
1914                 }
1915                 requeue_futex(this, hb1, hb2, &key2);
1916                 drop_count++;
1917         }
1918
1919         /*
1920          * We took an extra initial reference to the pi_state either
1921          * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
1922          * need to drop it here again.
1923          */
1924         put_pi_state(pi_state);
1925
1926 out_unlock:
1927         double_unlock_hb(hb1, hb2);
1928         wake_up_q(&wake_q);
1929         hb_waiters_dec(hb2);
1930
1931         /*
1932          * drop_futex_key_refs() must be called outside the spinlocks. During
1933          * the requeue we moved futex_q's from the hash bucket at key1 to the
1934          * one at key2 and updated their key pointer.  We no longer need to
1935          * hold the references to key1.
1936          */
1937         while (--drop_count >= 0)
1938                 drop_futex_key_refs(&key1);
1939
1940 out_put_keys:
1941         put_futex_key(&key2);
1942 out_put_key1:
1943         put_futex_key(&key1);
1944 out:
1945         return ret ? ret : task_count;
1946 }
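
/*
 * For illustration only (not compiled): the classic condition-variable
 * broadcast built on top of futex_requeue().  Rather than waking every
 * waiter on the condvar word (and having them all pile onto the mutex at
 * once), wake one and requeue the rest onto the mutex futex.  This is a
 * rough sketch; the helper below is hypothetical and a real implementation
 * (e.g. glibc's pthread_cond_broadcast()) has considerably more bookkeeping.
 *
 *	void cond_broadcast(int *cond, int *mutex)
 *	{
 *		int seq = *cond;	// value the sleepers passed to FUTEX_WAIT
 *
 *		// Wake one waiter, requeue up to INT_MAX others from cond to
 *		// mutex, but only if *cond is still == seq (else -EAGAIN).
 *		syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE, 1, INT_MAX,
 *			mutex, seq);
 *	}
 */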
1947
1948 /* The key must be already stored in q->key. */
1949 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1950         __acquires(&hb->lock)
1951 {
1952         struct futex_hash_bucket *hb;
1953
1954         hb = hash_futex(&q->key);
1955
1956         /*
1957          * Increment the counter before taking the lock so that
1958          * a potential waker won't miss a to-be-slept task that is
1959          * waiting for the spinlock. This is safe as all queue_lock()
1960          * users end up calling queue_me(). Similarly, for housekeeping,
1961          * decrement the counter at queue_unlock() when some error has
1962          * occurred and we don't end up adding the task to the list.
1963          */
1964         hb_waiters_inc(hb);
1965
1966         q->lock_ptr = &hb->lock;
1967
1968         spin_lock(&hb->lock); /* implies smp_mb(); (A) */
1969         return hb;
1970 }
1971
1972 static inline void
1973 queue_unlock(struct futex_hash_bucket *hb)
1974         __releases(&hb->lock)
1975 {
1976         spin_unlock(&hb->lock);
1977         hb_waiters_dec(hb);
1978 }
1979
1980 /**
1981  * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1982  * @q:  The futex_q to enqueue
1983  * @hb: The destination hash bucket
1984  *
1985  * The hb->lock must be held by the caller, and is released here. A call to
1986  * queue_me() is typically paired with exactly one call to unqueue_me().  The
1987  * exceptions involve the PI related operations, which may use unqueue_me_pi()
1988  * or nothing if the unqueue is done as part of the wake process and the unqueue
1989  * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
1990  * an example).
1991  */
1992 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1993         __releases(&hb->lock)
1994 {
1995         int prio;
1996
1997         /*
1998          * The priority used to register this element is
1999          * - either the real thread-priority for the real-time threads
2000          * (i.e. threads with a priority lower than MAX_RT_PRIO)
2001          * - or MAX_RT_PRIO for non-RT threads.
2002          * Thus, all RT-threads are woken first in priority order, and
2003          * the others are woken last, in FIFO order.
2004          */
2005         prio = min(current->normal_prio, MAX_RT_PRIO);
2006
2007         plist_node_init(&q->list, prio);
2008         plist_add(&q->list, &hb->chain);
2009         q->task = current;
2010         spin_unlock(&hb->lock);
2011 }
2012
2013 /**
2014  * unqueue_me() - Remove the futex_q from its futex_hash_bucket
2015  * @q:  The futex_q to unqueue
2016  *
2017  * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
2018  * be paired with exactly one earlier call to queue_me().
2019  *
2020  * Return:
2021  *   1 - if the futex_q was still queued (and we unqueued it);
2022  *   0 - if the futex_q was already removed by the waking thread
2023  */
2024 static int unqueue_me(struct futex_q *q)
2025 {
2026         spinlock_t *lock_ptr;
2027         int ret = 0;
2028
2029         /* In the common case we don't take the spinlock, which is nice. */
2030 retry:
2031         /*
2032          * q->lock_ptr can change between this read and the following spin_lock.
2033          * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
2034          * optimizing lock_ptr out of the logic below.
2035          */
2036         lock_ptr = READ_ONCE(q->lock_ptr);
2037         if (lock_ptr != NULL) {
2038                 spin_lock(lock_ptr);
2039                 /*
2040                  * q->lock_ptr can change between reading it and
2041                  * spin_lock(), causing us to take the wrong lock.  This
2042                  * corrects the race condition.
2043                  *
2044                  * Reasoning goes like this: if we have the wrong lock,
2045                  * q->lock_ptr must have changed (maybe several times)
2046                  * between reading it and the spin_lock().  It can
2047                  * change again after the spin_lock() but only if it was
2048                  * already changed before the spin_lock().  It cannot,
2049                  * however, change back to the original value.  Therefore
2050                  * we can detect whether we acquired the correct lock.
2051                  */
2052                 if (unlikely(lock_ptr != q->lock_ptr)) {
2053                         spin_unlock(lock_ptr);
2054                         goto retry;
2055                 }
2056                 __unqueue_futex(q);
2057
2058                 BUG_ON(q->pi_state);
2059
2060                 spin_unlock(lock_ptr);
2061                 ret = 1;
2062         }
2063
2064         drop_futex_key_refs(&q->key);
2065         return ret;
2066 }
2067
2068 /*
2069  * PI futexes cannot be requeued and must remove themselves from the
2070  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
2071  * and dropped here.
2072  */
2073 static void unqueue_me_pi(struct futex_q *q)
2074         __releases(q->lock_ptr)
2075 {
2076         __unqueue_futex(q);
2077
2078         BUG_ON(!q->pi_state);
2079         put_pi_state(q->pi_state);
2080         q->pi_state = NULL;
2081
2082         spin_unlock(q->lock_ptr);
2083 }
2084
2085 /*
2086  * Fixup the pi_state owner with the new owner.
2087  *
2088  * Must be called with hash bucket lock held and mm->sem held for non
2089  * private futexes.
2090  */
2091 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2092                                 struct task_struct *newowner)
2093 {
2094         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2095         struct futex_pi_state *pi_state = q->pi_state;
2096         struct task_struct *oldowner = pi_state->owner;
2097         u32 uval, uninitialized_var(curval), newval;
2098         int ret;
2099
2100         /* Owner died? */
2101         if (!pi_state->owner)
2102                 newtid |= FUTEX_OWNER_DIED;
2103
2104         /*
2105          * We are here either because we stole the rtmutex from the
2106          * previous highest priority waiter or we are the highest priority
2107          * waiter but failed to get the rtmutex the first time.
2108          * We have to replace the newowner TID in the user space variable.
2109          * This must be atomic as we have to preserve the owner died bit here.
2110          *
2111          * Note: We write the user space value _before_ changing the pi_state
2112          * because we can fault here. Imagine swapped out pages or a fork
2113          * that marked all the anonymous memory readonly for cow.
2114          *
2115          * Modifying pi_state _before_ the user space value would
2116          * leave the pi_state in an inconsistent state when we fault
2117          * here, because we need to drop the hash bucket lock to
2118          * handle the fault. This might be observed in the PID check
2119          * in lookup_pi_state.
2120          */
2121 retry:
2122         if (get_futex_value_locked(&uval, uaddr))
2123                 goto handle_fault;
2124
2125         while (1) {
2126                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
2127
2128                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
2129                         goto handle_fault;
2130                 if (curval == uval)
2131                         break;
2132                 uval = curval;
2133         }
2134
2135         /*
2136          * We fixed up user space. Now we need to fix the pi_state
2137          * itself.
2138          */
2139         if (pi_state->owner != NULL) {
2140                 raw_spin_lock_irq(&pi_state->owner->pi_lock);
2141                 WARN_ON(list_empty(&pi_state->list));
2142                 list_del_init(&pi_state->list);
2143                 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
2144         }
2145
2146         pi_state->owner = newowner;
2147
2148         raw_spin_lock_irq(&newowner->pi_lock);
2149         WARN_ON(!list_empty(&pi_state->list));
2150         list_add(&pi_state->list, &newowner->pi_state_list);
2151         raw_spin_unlock_irq(&newowner->pi_lock);
2152         return 0;
2153
2154         /*
2155          * To handle the page fault we need to drop the hash bucket
2156          * lock here. That gives the other task (either the highest priority
2157          * waiter itself or the task which stole the rtmutex) the
2158          * chance to try the fixup of the pi_state. So once we are
2159          * back from handling the fault we need to check the pi_state
2160          * after reacquiring the hash bucket lock and before trying to
2161          * do another fixup. When the fixup has been done already we
2162          * simply return.
2163          */
2164 handle_fault:
2165         spin_unlock(q->lock_ptr);
2166
2167         ret = fault_in_user_writeable(uaddr);
2168
2169         spin_lock(q->lock_ptr);
2170
2171         /*
2172          * Check if someone else fixed it for us:
2173          */
2174         if (pi_state->owner != oldowner)
2175                 return 0;
2176
2177         if (ret)
2178                 return ret;
2179
2180         goto retry;
2181 }
2182
2183 static long futex_wait_restart(struct restart_block *restart);
2184
2185 /**
2186  * fixup_owner() - Post lock pi_state and corner case management
2187  * @uaddr:      user address of the futex
2188  * @q:          futex_q (contains pi_state and access to the rt_mutex)
2189  * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
2190  *
2191  * After attempting to lock an rt_mutex, this function is called to cleanup
2192  * the pi_state owner as well as handle race conditions that may allow us to
2193  * acquire the lock. Must be called with the hb lock held.
2194  *
2195  * Return:
2196  *  1 - success, lock taken;
2197  *  0 - success, lock not taken;
2198  * <0 - on error (-EFAULT)
2199  */
2200 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2201 {
2202         struct task_struct *owner;
2203         int ret = 0;
2204
2205         if (locked) {
2206                 /*
2207                  * Got the lock. We might not be the anticipated owner if we
2208                  * did a lock-steal - fix up the PI-state in that case:
2209                  */
2210                 if (q->pi_state->owner != current)
2211                         ret = fixup_pi_state_owner(uaddr, q, current);
2212                 goto out;
2213         }
2214
2215         /*
2216          * Catch the rare case, where the lock was released when we were on the
2217          * way back before we locked the hash bucket.
2218          */
2219         if (q->pi_state->owner == current) {
2220                 /*
2221                  * Try to get the rt_mutex now. This might fail as some other
2222                  * task acquired the rt_mutex after we removed ourself from the
2223                  * rt_mutex waiters list.
2224                  */
2225                 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
2226                         locked = 1;
2227                         goto out;
2228                 }
2229
2230                 /*
2231                  * pi_state is incorrect, some other task did a lock steal and
2232                  * we returned due to timeout or signal without taking the
2233                  * rt_mutex. Too late.
2234                  */
2235                 raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
2236                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
2237                 if (!owner)
2238                         owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
2239                 raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
2240                 ret = fixup_pi_state_owner(uaddr, q, owner);
2241                 goto out;
2242         }
2243
2244         /*
2245          * Paranoia check. If we did not take the lock, then we should not be
2246          * the owner of the rt_mutex.
2247          */
2248         if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
2249                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
2250                                 "pi-state %p\n", ret,
2251                                 q->pi_state->pi_mutex.owner,
2252                                 q->pi_state->owner);
2253
2254 out:
2255         return ret ? ret : locked;
2256 }
2257
2258 /**
2259  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
2260  * @hb:         the futex hash bucket, must be locked by the caller
2261  * @q:          the futex_q to queue up on
2262  * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
2263  */
2264 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2265                                 struct hrtimer_sleeper *timeout)
2266 {
2267         /*
2268          * The task state is guaranteed to be set before another task can
2269          * wake it. set_current_state() is implemented using smp_store_mb() and
2270          * queue_me() calls spin_unlock() upon completion, both serializing
2271          * access to the hash list and forcing another memory barrier.
2272          */
2273         set_current_state(TASK_INTERRUPTIBLE);
2274         queue_me(q, hb);
2275
2276         /* Arm the timer */
2277         if (timeout)
2278                 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
2279
2280         /*
2281          * If we have been removed from the hash list, then another task
2282          * has tried to wake us, and we can skip the call to schedule().
2283          */
2284         if (likely(!plist_node_empty(&q->list))) {
2285                 /*
2286                  * If the timer has already expired, current will already be
2287                  * flagged for rescheduling. Only call schedule if there
2288                  * is no timeout, or if it has yet to expire.
2289                  */
2290                 if (!timeout || timeout->task)
2291                         freezable_schedule();
2292         }
2293         __set_current_state(TASK_RUNNING);
2294 }
2295
2296 /**
2297  * futex_wait_setup() - Prepare to wait on a futex
2298  * @uaddr:      the futex userspace address
2299  * @val:        the expected value
2300  * @flags:      futex flags (FLAGS_SHARED, etc.)
2301  * @q:          the associated futex_q
2302  * @hb:         storage for hash_bucket pointer to be returned to caller
2303  *
2304  * Setup the futex_q and locate the hash_bucket.  Get the futex value and
2305  * compare it with the expected value.  Handle atomic faults internally.
2306  * Return with the hb lock held and a q.key reference on success, and unlocked
2307  * with no q.key reference on failure.
2308  *
2309  * Return:
2310  *  0 - uaddr contains val and hb has been locked;
2311  * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2312  */
2313 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2314                            struct futex_q *q, struct futex_hash_bucket **hb)
2315 {
2316         u32 uval;
2317         int ret;
2318
2319         /*
2320          * Access the page AFTER the hash-bucket is locked.
2321          * Order is important:
2322          *
2323          *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
2324          *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
2325          *
2326          * The basic logical guarantee of a futex is that it blocks ONLY
2327          * if cond(var) is known to be true at the time of blocking, for
2328          * any cond.  If we locked the hash-bucket after testing *uaddr, that
2329          * would open a race condition where we could block indefinitely with
2330          * cond(var) false, which would violate the guarantee.
2331          *
2332          * On the other hand, we insert q and release the hash-bucket only
2333          * after testing *uaddr.  This guarantees that futex_wait() will NOT
2334          * absorb a wakeup if *uaddr does not match the desired values
2335          * while the syscall executes.
2336          */
2337 retry:
2338         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
2339         if (unlikely(ret != 0))
2340                 return ret;
2341
2342 retry_private:
2343         *hb = queue_lock(q);
2344
2345         ret = get_futex_value_locked(&uval, uaddr);
2346
2347         if (ret) {
2348                 queue_unlock(*hb);
2349
2350                 ret = get_user(uval, uaddr);
2351                 if (ret)
2352                         goto out;
2353
2354                 if (!(flags & FLAGS_SHARED))
2355                         goto retry_private;
2356
2357                 put_futex_key(&q->key);
2358                 goto retry;
2359         }
2360
2361         if (uval != val) {
2362                 queue_unlock(*hb);
2363                 ret = -EWOULDBLOCK;
2364         }
2365
2366 out:
2367         if (ret)
2368                 put_futex_key(&q->key);
2369         return ret;
2370 }
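
/*
 * For reference (not compiled): the userspace pattern the ordering above is
 * designed for.  'flag' is a hypothetical 32bit futex word; real code must
 * use atomic accesses for it.  Only FUTEX_WAIT/FUTEX_WAKE are actual ABI.
 *
 *	// Waiter: sleep until *flag becomes non-zero.
 *	while ((val = *flag) == 0)
 *		syscall(SYS_futex, flag, FUTEX_WAIT, val, NULL, NULL, 0);
 *
 *	// Waker: publish the condition first, then wake a sleeper.
 *	*flag = 1;
 *	syscall(SYS_futex, flag, FUTEX_WAKE, 1, NULL, NULL, 0);
 *
 * If *flag changes between the waiter's read and the FUTEX_WAIT call, the
 * kernel returns -EWOULDBLOCK instead of sleeping - exactly the check that
 * futex_wait_setup() performs above.
 */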
2371
2372 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2373                       ktime_t *abs_time, u32 bitset)
2374 {
2375         struct hrtimer_sleeper timeout, *to = NULL;
2376         struct restart_block *restart;
2377         struct futex_hash_bucket *hb;
2378         struct futex_q q = futex_q_init;
2379         int ret;
2380
2381         if (!bitset)
2382                 return -EINVAL;
2383         q.bitset = bitset;
2384
2385         if (abs_time) {
2386                 to = &timeout;
2387
2388                 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2389                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
2390                                       HRTIMER_MODE_ABS);
2391                 hrtimer_init_sleeper(to, current);
2392                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2393                                              current->timer_slack_ns);
2394         }
2395
2396 retry:
2397         /*
2398          * Prepare to wait on uaddr. On success, holds hb lock and increments
2399          * q.key refs.
2400          */
2401         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2402         if (ret)
2403                 goto out;
2404
2405         /* queue_me and wait for wakeup, timeout, or a signal. */
2406         futex_wait_queue_me(hb, &q, to);
2407
2408         /* If we were woken (and unqueued), we succeeded, whatever. */
2409         ret = 0;
2410         /* unqueue_me() drops q.key ref */
2411         if (!unqueue_me(&q))
2412                 goto out;
2413         ret = -ETIMEDOUT;
2414         if (to && !to->task)
2415                 goto out;
2416
2417         /*
2418          * We expect signal_pending(current), but we might be the
2419          * victim of a spurious wakeup as well.
2420          */
2421         if (!signal_pending(current))
2422                 goto retry;
2423
2424         ret = -ERESTARTSYS;
2425         if (!abs_time)
2426                 goto out;
2427
2428         restart = &current->restart_block;
2429         restart->fn = futex_wait_restart;
2430         restart->futex.uaddr = uaddr;
2431         restart->futex.val = val;
2432         restart->futex.time = abs_time->tv64;
2433         restart->futex.bitset = bitset;
2434         restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2435
2436         ret = -ERESTART_RESTARTBLOCK;
2437
2438 out:
2439         if (to) {
2440                 hrtimer_cancel(&to->timer);
2441                 destroy_hrtimer_on_stack(&to->timer);
2442         }
2443         return ret;
2444 }
2445
2446
2447 static long futex_wait_restart(struct restart_block *restart)
2448 {
2449         u32 __user *uaddr = restart->futex.uaddr;
2450         ktime_t t, *tp = NULL;
2451
2452         if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2453                 t.tv64 = restart->futex.time;
2454                 tp = &t;
2455         }
2456         restart->fn = do_no_restart_syscall;
2457
2458         return (long)futex_wait(uaddr, restart->futex.flags,
2459                                 restart->futex.val, tp, restart->futex.bitset);
2460 }
2461
2462
2463 /*
2464  * Userspace tried a 0 -> TID atomic transition of the futex value
2465  * and failed. The kernel side here does the whole locking operation:
2466  * if there are waiters then it will block as a consequence of relying
2467  * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
2468  * a 0 value of the futex too.)
2469  *
2470  * Also serves as a futex trylock_pi() operation, with the corresponding semantics.
2471  */
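/*
 * For reference (not compiled): a minimal sketch of the userspace fast path
 * that this slow path backs up.  'lock', 'tid' and cmpxchg() are
 * hypothetical stand-ins (cmpxchg() is assumed to return true on success);
 * the 0 <-> TID protocol and the FUTEX_LOCK_PI/FUTEX_UNLOCK_PI calls are
 * the real PI-futex ABI.
 *
 *	// lock: try the uncontended 0 -> TID transition first.
 *	if (!cmpxchg(lock, 0, tid))
 *		syscall(SYS_futex, lock, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *
 *	// unlock: try the uncontended TID -> 0 transition first.  This only
 *	// succeeds if neither FUTEX_WAITERS nor FUTEX_OWNER_DIED is set.
 *	if (!cmpxchg(lock, tid, 0))
 *		syscall(SYS_futex, lock, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */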
2472 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2473                          ktime_t *time, int trylock)
2474 {
2475         struct hrtimer_sleeper timeout, *to = NULL;
2476         struct futex_hash_bucket *hb;
2477         struct futex_q q = futex_q_init;
2478         int res, ret;
2479
2480         if (refill_pi_state_cache())
2481                 return -ENOMEM;
2482
2483         if (time) {
2484                 to = &timeout;
2485                 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2486                                       HRTIMER_MODE_ABS);
2487                 hrtimer_init_sleeper(to, current);
2488                 hrtimer_set_expires(&to->timer, *time);
2489         }
2490
2491 retry:
2492         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
2493         if (unlikely(ret != 0))
2494                 goto out;
2495
2496 retry_private:
2497         hb = queue_lock(&q);
2498
2499         ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2500         if (unlikely(ret)) {
2501                 /*
2502                  * Atomic work succeeded and we got the lock,
2503                  * or failed. Either way, we do _not_ block.
2504                  */
2505                 switch (ret) {
2506                 case 1:
2507                         /* We got the lock. */
2508                         ret = 0;
2509                         goto out_unlock_put_key;
2510                 case -EFAULT:
2511                         goto uaddr_faulted;
2512                 case -EAGAIN:
2513                         /*
2514                          * Two reasons for this:
2515                          * - Task is exiting and we just wait for the
2516                          *   exit to complete.
2517                          * - The user space value changed.
2518                          */
2519                         queue_unlock(hb);
2520                         put_futex_key(&q.key);
2521                         cond_resched();
2522                         goto retry;
2523                 default:
2524                         goto out_unlock_put_key;
2525                 }
2526         }
2527
2528         /*
2529          * Only actually queue now that the atomic ops are done:
2530          */
2531         queue_me(&q, hb);
2532
2533         WARN_ON(!q.pi_state);
2534         /*
2535          * Block on the PI mutex:
2536          */
2537         if (!trylock) {
2538                 ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
2539         } else {
2540                 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2541                 /* Fixup the trylock return value: */
2542                 ret = ret ? 0 : -EWOULDBLOCK;
2543         }
2544
2545         spin_lock(q.lock_ptr);
2546         /*
2547          * Fixup the pi_state owner and possibly acquire the lock if we
2548          * haven't already.
2549          */
2550         res = fixup_owner(uaddr, &q, !ret);
2551         /*
2552          * If fixup_owner() returned an error, propagate that.  If it acquired
2553          * the lock, clear our -ETIMEDOUT or -EINTR.
2554          */
2555         if (res)
2556                 ret = (res < 0) ? res : 0;
2557
2558         /*
2559          * If fixup_owner() faulted and was unable to handle the fault, unlock
2560          * it and return the fault to userspace.
2561          */
2562         if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2563                 rt_mutex_unlock(&q.pi_state->pi_mutex);
2564
2565         /* Unqueue and drop the lock */
2566         unqueue_me_pi(&q);
2567
2568         goto out_put_key;
2569
2570 out_unlock_put_key:
2571         queue_unlock(hb);
2572
2573 out_put_key:
2574         put_futex_key(&q.key);
2575 out:
2576         if (to)
2577                 destroy_hrtimer_on_stack(&to->timer);
2578         return ret != -EINTR ? ret : -ERESTARTNOINTR;
2579
2580 uaddr_faulted:
2581         queue_unlock(hb);
2582
2583         ret = fault_in_user_writeable(uaddr);
2584         if (ret)
2585                 goto out_put_key;
2586
2587         if (!(flags & FLAGS_SHARED))
2588                 goto retry_private;
2589
2590         put_futex_key(&q.key);
2591         goto retry;
2592 }
2593
2594 /*
2595  * Userspace attempted a TID -> 0 atomic transition, and failed.
2596  * This is the in-kernel slowpath: we look up the PI state (if any),
2597  * and do the rt-mutex unlock.
2598  */
2599 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2600 {
2601         u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
2602         union futex_key key = FUTEX_KEY_INIT;
2603         struct futex_hash_bucket *hb;
2604         struct futex_q *match;
2605         int ret;
2606
2607 retry:
2608         if (get_user(uval, uaddr))
2609                 return -EFAULT;
2610         /*
2611          * We release only a lock we actually own:
2612          */
2613         if ((uval & FUTEX_TID_MASK) != vpid)
2614                 return -EPERM;
2615
2616         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2617         if (ret)
2618                 return ret;
2619
2620         hb = hash_futex(&key);
2621         spin_lock(&hb->lock);
2622
2623         /*
2624          * Check waiters first. We do not trust user space values at
2625          * all and we at least want to know if user space fiddled
2626          * with the futex value instead of blindly unlocking.
2627          */
2628         match = futex_top_waiter(hb, &key);
2629         if (match) {
2630                 ret = wake_futex_pi(uaddr, uval, match, hb);
2631                 /*
2632                  * In case of success wake_futex_pi dropped the hash
2633                  * bucket lock.
2634                  */
2635                 if (!ret)
2636                         goto out_putkey;
2637                 /*
2638                  * The atomic access to the futex value generated a
2639                  * pagefault, so retry the user-access and the wakeup:
2640                  */
2641                 if (ret == -EFAULT)
2642                         goto pi_faulted;
2643                 /*
2644                  * An unconditional UNLOCK_PI op raced against a waiter
2645                  * setting the FUTEX_WAITERS bit. Try again.
2646                  */
2647                 if (ret == -EAGAIN) {
2648                         spin_unlock(&hb->lock);
2649                         put_futex_key(&key);
2650                         goto retry;
2651                 }
2652                 /*
2653                  * wake_futex_pi has detected invalid state. Tell user
2654                  * space.
2655                  */
2656                 goto out_unlock;
2657         }
2658
2659         /*
2660          * We have no kernel internal state, i.e. no waiters in the
2661          * kernel. Waiters which are about to queue themselves are stuck
2662          * on hb->lock. So we can safely ignore them. We preserve
2663          * neither the WAITERS bit nor the OWNER_DIED one. We are the
2664          * owner.
2665          */
2666         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
2667                 goto pi_faulted;
2668
2669         /*
2670          * If uval has changed, let user space handle it.
2671          */
2672         ret = (curval == uval) ? 0 : -EAGAIN;
2673
2674 out_unlock:
2675         spin_unlock(&hb->lock);
2676 out_putkey:
2677         put_futex_key(&key);
2678         return ret;
2679
2680 pi_faulted:
2681         spin_unlock(&hb->lock);
2682         put_futex_key(&key);
2683
2684         ret = fault_in_user_writeable(uaddr);
2685         if (!ret)
2686                 goto retry;
2687
2688         return ret;
2689 }
2690
2691 /**
2692  * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2693  * @hb:         the hash_bucket futex_q was originally enqueued on
2694  * @q:          the futex_q woken while waiting to be requeued
2695  * @key2:       the futex_key of the requeue target futex
2696  * @timeout:    the timeout associated with the wait (NULL if none)
2697  *
2698  * Detect if the task was woken on the initial futex as opposed to the requeue
2699  * target futex.  If so, determine if it was a timeout or a signal that caused
2700  * the wakeup and return the appropriate error code to the caller.  Must be
2701  * called with the hb lock held.
2702  *
2703  * Return:
2704  *  0 - no early wakeup detected;
2705  * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2706  */
2707 static inline
2708 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2709                                    struct futex_q *q, union futex_key *key2,
2710                                    struct hrtimer_sleeper *timeout)
2711 {
2712         int ret = 0;
2713
2714         /*
2715          * With the hb lock held, we avoid races while we process the wakeup.
2716          * We only need to hold hb (and not hb2) to ensure atomicity as the
2717          * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2718          * It can't be requeued from uaddr2 to something else since we don't
2719          * support a PI aware source futex for requeue.
2720          */
2721         if (!match_futex(&q->key, key2)) {
2722                 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2723                 /*
2724                  * We were woken prior to requeue by a timeout or a signal.
2725                  * Unqueue the futex_q and determine which it was.
2726                  */
2727                 plist_del(&q->list, &hb->chain);
2728                 hb_waiters_dec(hb);
2729
2730                 /* Handle spurious wakeups gracefully */
2731                 ret = -EWOULDBLOCK;
2732                 if (timeout && !timeout->task)
2733                         ret = -ETIMEDOUT;
2734                 else if (signal_pending(current))
2735                         ret = -ERESTARTNOINTR;
2736         }
2737         return ret;
2738 }
2739
2740 /**
2741  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2742  * @uaddr:      the futex we initially wait on (non-pi)
2743  * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2744  *              the same type, no requeueing from private to shared, etc.
2745  * @val:        the expected value of uaddr
2746  * @abs_time:   absolute timeout
2747  * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
2748  * @uaddr2:     the pi futex we will take prior to returning to user-space
2749  *
2750  * The caller will wait on uaddr and will be requeued by futex_requeue() to
2751  * uaddr2 which must be PI aware and distinct from uaddr.  Normal wakeup will wake
2752  * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2753  * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
2754  * without one, the pi logic would not know which task to boost/deboost, if
2755  * there was a need to.
2756  *
2757  * We call schedule in futex_wait_queue_me() when we enqueue and return there
2758  * via the following--
2759  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2760  * 2) wakeup on uaddr2 after a requeue
2761  * 3) signal
2762  * 4) timeout
2763  *
2764  * If 3, cleanup and return -ERESTARTNOINTR.
2765  *
2766  * If 2, we may then block on trying to take the rt_mutex and return via:
2767  * 5) successful lock
2768  * 6) signal
2769  * 7) timeout
2770  * 8) other lock acquisition failure
2771  *
2772  * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2773  *
2774  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2775  *
2776  * Return:
2777  *  0 - On success;
2778  * <0 - On error
2779  */
2780 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2781                                  u32 val, ktime_t *abs_time, u32 bitset,
2782                                  u32 __user *uaddr2)
2783 {
2784         struct hrtimer_sleeper timeout, *to = NULL;
2785         struct rt_mutex_waiter rt_waiter;
2786         struct rt_mutex *pi_mutex = NULL;
2787         struct futex_hash_bucket *hb;
2788         union futex_key key2 = FUTEX_KEY_INIT;
2789         struct futex_q q = futex_q_init;
2790         int res, ret;
2791
2792         if (uaddr == uaddr2)
2793                 return -EINVAL;
2794
2795         if (!bitset)
2796                 return -EINVAL;
2797
2798         if (abs_time) {
2799                 to = &timeout;
2800                 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2801                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
2802                                       HRTIMER_MODE_ABS);
2803                 hrtimer_init_sleeper(to, current);
2804                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2805                                              current->timer_slack_ns);
2806         }
2807
2808         /*
2809          * The waiter is allocated on our stack, manipulated by the requeue
2810          * code while we sleep on uaddr.
2811          */
2812         debug_rt_mutex_init_waiter(&rt_waiter);
2813         RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2814         RB_CLEAR_NODE(&rt_waiter.tree_entry);
2815         rt_waiter.task = NULL;
2816
2817         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2818         if (unlikely(ret != 0))
2819                 goto out;
2820
2821         q.bitset = bitset;
2822         q.rt_waiter = &rt_waiter;
2823         q.requeue_pi_key = &key2;
2824
2825         /*
2826          * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2827          * count.
2828          */
2829         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2830         if (ret)
2831                 goto out_key2;
2832
2833         /*
2834          * The check above which compares uaddrs is not sufficient for
2835          * shared futexes. We need to compare the keys:
2836          */
2837         if (match_futex(&q.key, &key2)) {
2838                 queue_unlock(hb);
2839                 ret = -EINVAL;
2840                 goto out_put_keys;
2841         }
2842
2843         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2844         futex_wait_queue_me(hb, &q, to);
2845
2846         spin_lock(&hb->lock);
2847         ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2848         spin_unlock(&hb->lock);
2849         if (ret)
2850                 goto out_put_keys;
2851
2852         /*
2853          * In order for us to be here, we know our q.key == key2, and since
2854          * we took the hb->lock above, we also know that futex_requeue() has
2855          * completed and we no longer have to concern ourselves with a wakeup
2856          * race with the atomic proxy lock acquisition by the requeue code. The
2857          * futex_requeue dropped our key1 reference and incremented our key2
2858          * reference count.
2859          */
2860
2861         /* Check if the requeue code acquired the second futex for us. */
2862         if (!q.rt_waiter) {
2863                 /*
2864                  * Got the lock. We might not be the anticipated owner if we
2865                  * did a lock-steal - fix up the PI-state in that case.
2866                  */
2867                 if (q.pi_state && (q.pi_state->owner != current)) {
2868                         spin_lock(q.lock_ptr);
2869                         ret = fixup_pi_state_owner(uaddr2, &q, current);
2870                         /*
2871                          * Drop the reference to the pi state which
2872                          * the requeue_pi() code acquired for us.
2873                          */
2874                         put_pi_state(q.pi_state);
2875                         spin_unlock(q.lock_ptr);
2876                 }
2877         } else {
2878                 /*
2879                  * We have been woken up by futex_unlock_pi(), a timeout, or a
2880                  * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
2881                  * the pi_state.
2882                  */
2883                 WARN_ON(!q.pi_state);
2884                 pi_mutex = &q.pi_state->pi_mutex;
2885                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
2886                 debug_rt_mutex_free_waiter(&rt_waiter);
2887
2888                 spin_lock(q.lock_ptr);
2889                 /*
2890                  * Fixup the pi_state owner and possibly acquire the lock if we
2891                  * haven't already.
2892                  */
2893                 res = fixup_owner(uaddr2, &q, !ret);
2894                 /*
2895                  * If fixup_owner() returned an error, propagate that.  If it
2896                  * acquired the lock, clear -ETIMEDOUT or -EINTR.
2897                  */
2898                 if (res)
2899                         ret = (res < 0) ? res : 0;
2900
2901                 /* Unqueue and drop the lock. */
2902                 unqueue_me_pi(&q);
2903         }
2904
2905         /*
2906          * If fixup_pi_state_owner() faulted and was unable to handle the
2907          * fault, unlock the rt_mutex and return the fault to userspace.
2908          */
2909         if (ret == -EFAULT) {
2910                 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2911                         rt_mutex_unlock(pi_mutex);
2912         } else if (ret == -EINTR) {
2913                 /*
2914                  * We've already been requeued, but cannot restart by calling
2915                  * futex_lock_pi() directly. We could restart this syscall, but
2916                  * it would detect that the user space "val" changed and return
2917                  * -EWOULDBLOCK.  Save the overhead of the restart and return
2918                  * -EWOULDBLOCK directly.
2919                  */
2920                 ret = -EWOULDBLOCK;
2921         }
2922
2923 out_put_keys:
2924         put_futex_key(&q.key);
2925 out_key2:
2926         put_futex_key(&key2);
2927
2928 out:
2929         if (to) {
2930                 hrtimer_cancel(&to->timer);
2931                 destroy_hrtimer_on_stack(&to->timer);
2932         }
2933         return ret;
2934 }
2935
2936 /*
2937  * Support for robust futexes: the kernel cleans up held futexes at
2938  * thread exit time.
2939  *
2940  * Implementation: user-space maintains a per-thread list of locks it
2941  * is holding. Upon do_exit(), the kernel carefully walks this list,
2942  * and marks all locks that are owned by this thread with the
2943  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2944  * always manipulated with the lock held, so the list is private and
2945  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2946  * field, to allow the kernel to clean up if the thread dies after
2947  * acquiring the lock, but just before it could have added itself to
2948  * the list. There can only be one such pending lock.
2949  */
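
/*
 * A minimal sketch of the user-space side of this protocol (illustrative
 * only: the field names match the uapi struct robust_list_head, but the
 * lock layout and the variable names below are made up for the example):
 *
 *	struct my_lock {
 *		struct robust_list list;	// links the lock into the list
 *		u32 futex;			// futex word, holds the owner TID
 *	};
 *
 *	static __thread struct robust_list_head head;
 *
 *	// once per thread:
 *	head.list.next = &head.list;		// empty (circular) list
 *	head.futex_offset = offsetof(struct my_lock, futex) -
 *			    offsetof(struct my_lock, list);
 *	head.list_op_pending = NULL;
 *	syscall(SYS_set_robust_list, &head, sizeof(head));
 *
 *	// around the atomic acquisition of lock->futex:
 *	head.list_op_pending = &lock->list;
 *	// ... cmpxchg our TID into lock->futex, link lock->list after head.list ...
 *	head.list_op_pending = NULL;
 *
 * If the thread dies anywhere in that window, exit_robust_list() below finds
 * the lock either on the list or via list_op_pending, sets FUTEX_OWNER_DIED
 * in the futex word and wakes one waiter.
 */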
2950
2951 /**
2952  * sys_set_robust_list() - Set the robust-futex list head of a task
2953  * @head:       pointer to the list-head
2954  * @len:        length of the list-head, as userspace expects
2955  */
2956 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2957                 size_t, len)
2958 {
2959         if (!futex_cmpxchg_enabled)
2960                 return -ENOSYS;
2961         /*
2962          * The kernel knows only one size for now:
2963          */
2964         if (unlikely(len != sizeof(*head)))
2965                 return -EINVAL;
2966
2967         current->robust_list = head;
2968
2969         return 0;
2970 }
2971
2972 /**
2973  * sys_get_robust_list() - Get the robust-futex list head of a task
2974  * @pid:        pid of the process [zero for current task]
2975  * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
2976  * @len_ptr:    pointer to a length field, the kernel fills in the header size
2977  */
2978 SYSCALL_DEFINE3(get_robust_list, int, pid,
2979                 struct robust_list_head __user * __user *, head_ptr,
2980                 size_t __user *, len_ptr)
2981 {
2982         struct robust_list_head __user *head;
2983         unsigned long ret;
2984         struct task_struct *p;
2985
2986         if (!futex_cmpxchg_enabled)
2987                 return -ENOSYS;
2988
2989         rcu_read_lock();
2990
2991         ret = -ESRCH;
2992         if (!pid)
2993                 p = current;
2994         else {
2995                 p = find_task_by_vpid(pid);
2996                 if (!p)
2997                         goto err_unlock;
2998         }
2999
3000         ret = -EPERM;
3001         if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
3002                 goto err_unlock;
3003
3004         head = p->robust_list;
3005         rcu_read_unlock();
3006
3007         if (put_user(sizeof(*head), len_ptr))
3008                 return -EFAULT;
3009         return put_user(head, head_ptr);
3010
3011 err_unlock:
3012         rcu_read_unlock();
3013
3014         return ret;
3015 }
3016
3017 /*
3018  * Process a futex-list entry, check whether it's owned by the
3019  * dying task, and do notification if so:
3020  */
3021 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
3022 {
3023         u32 uval, uninitialized_var(nval), mval;
3024
3025 retry:
3026         if (get_user(uval, uaddr))
3027                 return -1;
3028
3029         if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
3030                 /*
3031                  * Ok, this dying thread is truly holding a futex
3032                  * of interest. Set the OWNER_DIED bit atomically
3033                  * via cmpxchg, and if the value had FUTEX_WAITERS
3034                  * set, wake up a waiter (if any). (We have to do a
3035                  * futex_wake() even if OWNER_DIED is already set -
3036                  * to handle the rare but possible case of recursive
3037                  * thread-death.) The rest of the cleanup is done in
3038                  * userspace.
3039                  */
3040                 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
3041                 /*
3042                  * We are not holding a lock here, but we want to have
3043                  * the pagefault_disable/enable() protection because
3044                  * we want to handle the fault gracefully. If the
3045                  * access fails we try to fault in the futex with R/W
3046                  * verification via get_user_pages. get_user() above
3047                  * does not guarantee R/W access. If that fails we
3048                  * give up and leave the futex locked.
3049                  */
3050                 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
3051                         if (fault_in_user_writeable(uaddr))
3052                                 return -1;
3053                         goto retry;
3054                 }
3055                 if (nval != uval)
3056                         goto retry;
3057
3058                 /*
3059                  * Wake robust non-PI futexes here. The wakeup of
3060                  * PI futexes happens in exit_pi_state_list():
3061                  */
3062                 if (!pi && (uval & FUTEX_WAITERS))
3063                         futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3064         }
3065         return 0;
3066 }
3067
3068 /*
3069  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3070  */
3071 static inline int fetch_robust_entry(struct robust_list __user **entry,
3072                                      struct robust_list __user * __user *head,
3073                                      unsigned int *pi)
3074 {
3075         unsigned long uentry;
3076
3077         if (get_user(uentry, (unsigned long __user *)head))
3078                 return -EFAULT;
3079
3080         *entry = (void __user *)(uentry & ~1UL);
3081         *pi = uentry & 1;
3082
3083         return 0;
3084 }
3085
3086 /*
3087  * Walk curr->robust_list (very carefully, it's a userspace list!)
3088  * and mark any locks found there dead, and notify any waiters.
3089  *
3090  * We silently return on any sign of list-walking problem.
3091  */
3092 void exit_robust_list(struct task_struct *curr)
3093 {
3094         struct robust_list_head __user *head = curr->robust_list;
3095         struct robust_list __user *entry, *next_entry, *pending;
3096         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3097         unsigned int uninitialized_var(next_pi);
3098         unsigned long futex_offset;
3099         int rc;
3100
3101         if (!futex_cmpxchg_enabled)
3102                 return;
3103
3104         /*
3105          * Fetch the list head (which was registered earlier, via
3106          * sys_set_robust_list()):
3107          */
3108         if (fetch_robust_entry(&entry, &head->list.next, &pi))
3109                 return;
3110         /*
3111          * Fetch the relative futex offset:
3112          */
3113         if (get_user(futex_offset, &head->futex_offset))
3114                 return;
3115         /*
3116          * Fetch any possibly pending lock-add first, and handle it
3117          * if it exists:
3118          */
3119         if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
3120                 return;
3121
3122         next_entry = NULL;      /* avoid warning with gcc */
3123         while (entry != &head->list) {
3124                 /*
3125                  * Fetch the next entry in the list before calling
3126                  * handle_futex_death:
3127                  */
3128                 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
3129                 /*
3130                  * A pending lock might already be on the list, so
3131                  * don't process it twice:
3132                  */
3133                 if (entry != pending)
3134                         if (handle_futex_death((void __user *)entry + futex_offset,
3135                                                 curr, pi))
3136                                 return;
3137                 if (rc)
3138                         return;
3139                 entry = next_entry;
3140                 pi = next_pi;
3141                 /*
3142                  * Avoid excessively long or circular lists:
3143                  */
3144                 if (!--limit)
3145                         break;
3146
3147                 cond_resched();
3148         }
3149
3150         if (pending)
3151                 handle_futex_death((void __user *)pending + futex_offset,
3152                                    curr, pip);
3153 }
3154
3155 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
3156                 u32 __user *uaddr2, u32 val2, u32 val3)
3157 {
3158         int cmd = op & FUTEX_CMD_MASK;
3159         unsigned int flags = 0;
3160
3161         if (!(op & FUTEX_PRIVATE_FLAG))
3162                 flags |= FLAGS_SHARED;
3163
3164         if (op & FUTEX_CLOCK_REALTIME) {
3165                 flags |= FLAGS_CLOCKRT;
3166                 if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET &&
3167                     cmd != FUTEX_WAIT_REQUEUE_PI)
3168                         return -ENOSYS;
3169         }
3170
3171         switch (cmd) {
3172         case FUTEX_LOCK_PI:
3173         case FUTEX_UNLOCK_PI:
3174         case FUTEX_TRYLOCK_PI:
3175         case FUTEX_WAIT_REQUEUE_PI:
3176         case FUTEX_CMP_REQUEUE_PI:
3177                 if (!futex_cmpxchg_enabled)
3178                         return -ENOSYS;
3179         }
3180
3181         switch (cmd) {
3182         case FUTEX_WAIT:
3183                 val3 = FUTEX_BITSET_MATCH_ANY; /* fall through */
3184         case FUTEX_WAIT_BITSET:
3185                 return futex_wait(uaddr, flags, val, timeout, val3);
3186         case FUTEX_WAKE:
3187                 val3 = FUTEX_BITSET_MATCH_ANY; /* fall through */
3188         case FUTEX_WAKE_BITSET:
3189                 return futex_wake(uaddr, flags, val, val3);
3190         case FUTEX_REQUEUE:
3191                 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
3192         case FUTEX_CMP_REQUEUE:
3193                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
3194         case FUTEX_WAKE_OP:
3195                 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
3196         case FUTEX_LOCK_PI:
3197                 return futex_lock_pi(uaddr, flags, timeout, 0);
3198         case FUTEX_UNLOCK_PI:
3199                 return futex_unlock_pi(uaddr, flags);
3200         case FUTEX_TRYLOCK_PI:
3201                 return futex_lock_pi(uaddr, flags, NULL, 1);
3202         case FUTEX_WAIT_REQUEUE_PI:
3203                 val3 = FUTEX_BITSET_MATCH_ANY;
3204                 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
3205                                              uaddr2);
3206         case FUTEX_CMP_REQUEUE_PI:
3207                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
3208         }
3209         return -ENOSYS;
3210 }
3211
3212
3213 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
3214                 struct timespec __user *, utime, u32 __user *, uaddr2,
3215                 u32, val3)
3216 {
3217         struct timespec ts;
3218         ktime_t t, *tp = NULL;
3219         u32 val2 = 0;
3220         int cmd = op & FUTEX_CMD_MASK;
3221
3222         if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
3223                       cmd == FUTEX_WAIT_BITSET ||
3224                       cmd == FUTEX_WAIT_REQUEUE_PI)) {
3225                 if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
3226                         return -EFAULT;
3227                 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
3228                         return -EFAULT;
3229                 if (!timespec_valid(&ts))
3230                         return -EINVAL;
3231
3232                 t = timespec_to_ktime(ts);
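		/*
		 * FUTEX_WAIT takes a relative timeout, so convert it to an
		 * absolute expiry here; FUTEX_LOCK_PI, FUTEX_WAIT_BITSET and
		 * FUTEX_WAIT_REQUEUE_PI already pass an absolute timeout.
		 */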
3233                 if (cmd == FUTEX_WAIT)
3234                         t = ktime_add_safe(ktime_get(), t);
3235                 tp = &t;
3236         }
3237         /*
3238          * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
3239          * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
3240          */
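	/*
	 * Illustrative raw-syscall calls for this overloading (the variable
	 * names here are made up for the example):
	 *
	 *	// wake nr_wake waiters on uaddr and requeue up to nr_requeue
	 *	// more onto uaddr2, provided *uaddr still equals expected:
	 *	syscall(SYS_futex, uaddr, FUTEX_CMP_REQUEUE, nr_wake,
	 *		nr_requeue, uaddr2, expected);
	 *
	 *	// wake nr_wake waiters on uaddr and, depending on the encoded
	 *	// operation/comparison in wake_op, up to nr_wake2 on uaddr2:
	 *	syscall(SYS_futex, uaddr, FUTEX_WAKE_OP, nr_wake,
	 *		nr_wake2, uaddr2, wake_op);
	 *
	 * In both cases the fourth argument travels in the 'utime' slot and is
	 * recovered as val2 below.
	 */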
3241         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
3242             cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
3243                 val2 = (u32) (unsigned long) utime;
3244
3245         return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
3246 }
3247
3248 static void __init futex_detect_cmpxchg(void)
3249 {
3250 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
3251         u32 curval;
3252
3253         /*
3254          * This will fail, and we want it to. Some arch implementations do
3255          * runtime detection of the futex_atomic_cmpxchg_inatomic()
3256          * functionality. We want to know that before we call into any
3257          * of the complex code paths. Also we want to prevent
3258          * registration of robust lists in that case. NULL is
3259          * guaranteed to fault and we get -EFAULT on a functional
3260          * implementation; the non-functional ones will return
3261          * -ENOSYS.
3262          */
3263         if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
3264                 futex_cmpxchg_enabled = 1;
3265 #endif
3266 }
3267
3268 static int __init futex_init(void)
3269 {
3270         unsigned int futex_shift;
3271         unsigned long i;
3272
3273 #if CONFIG_BASE_SMALL
3274         futex_hashsize = 16;
3275 #else
3276         futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
3277 #endif
3278
3279         futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
3280                                                futex_hashsize, 0,
3281                                                futex_hashsize < 256 ? HASH_SMALL : 0,
3282                                                &futex_shift, NULL,
3283                                                futex_hashsize, futex_hashsize);
3284         futex_hashsize = 1UL << futex_shift;
3285
3286         futex_detect_cmpxchg();
3287
3288         for (i = 0; i < futex_hashsize; i++) {
3289                 atomic_set(&futex_queues[i].waiters, 0);
3290                 plist_head_init(&futex_queues[i].chain);
3291                 spin_lock_init(&futex_queues[i].lock);
3292         }
3293
3294         return 0;
3295 }
3296 __initcall(futex_init);