1 /*
2  * Implement CPU time clocks for the POSIX clock interface.
3  */
4
5 #include <linux/sched.h>
6 #include <linux/posix-timers.h>
7 #include <linux/errno.h>
8 #include <linux/math64.h>
9 #include <asm/uaccess.h>
10 #include <linux/kernel_stat.h>
11 #include <trace/events/timer.h>
12
13 /*
14  * Called after updating RLIMIT_CPU to run the cpu timer and update the
15  * tsk->signal->cputime_expires expiration cache if necessary. Needs
16  * siglock protection since other code may update the expiration cache
17  * as well.
18  */
19 void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
20 {
21         cputime_t cputime = secs_to_cputime(rlim_new);
22
23         spin_lock_irq(&task->sighand->siglock);
24         set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
25         spin_unlock_irq(&task->sighand->siglock);
26 }
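
/*
 * Usage sketch: update_rlimit_cpu() converts the new RLIMIT_CPU value
 * (in seconds) to cputime and installs it as a process-wide CPUCLOCK_PROF
 * expiry via set_process_cpu_timer() below.  It is typically reached from
 * the setrlimit()/prlimit() path when the CPU limit changes (assumption;
 * the caller lives outside this file).  From userspace, for example:
 *
 *   struct rlimit rl = { .rlim_cur = 10, .rlim_max = 20 };
 *   setrlimit(RLIMIT_CPU, &rl);     => update_rlimit_cpu(tsk, 10)
 */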
27
28 static int check_clock(const clockid_t which_clock)
29 {
30         int error = 0;
31         struct task_struct *p;
32         const pid_t pid = CPUCLOCK_PID(which_clock);
33
34         if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
35                 return -EINVAL;
36
37         if (pid == 0)
38                 return 0;
39
40         rcu_read_lock();
41         p = find_task_by_vpid(pid);
42         if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
43                    same_thread_group(p, current) : has_group_leader_pid(p))) {
44                 error = -EINVAL;
45         }
46         rcu_read_unlock();
47
48         return error;
49 }
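
/*
 * For reference, the CPU clock id layout (per <linux/posix-timers.h>,
 * summarized here only roughly):
 *
 *   CPUCLOCK_WHICH(clock)     - low bits select 0 = PROF, 1 = VIRT, 2 = SCHED
 *   CPUCLOCK_PERTHREAD(clock) - set for a per-thread clock, clear for the
 *                               whole thread group
 *   CPUCLOCK_PID(clock)       - the target pid, encoded in the upper bits;
 *                               0 means the calling thread/process
 *
 * Userspace typically obtains such ids through clock_getcpuclockid() or
 * pthread_getcpuclockid() rather than constructing them by hand.
 */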
50
51 static inline union cpu_time_count
52 timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
53 {
54         union cpu_time_count ret;
55         ret.sched = 0;          /* high half always zero when .cpu used */
56         if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
57                 ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
58         } else {
59                 ret.cpu = timespec_to_cputime(tp);
60         }
61         return ret;
62 }
63
64 static void sample_to_timespec(const clockid_t which_clock,
65                                union cpu_time_count cpu,
66                                struct timespec *tp)
67 {
68         if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
69                 *tp = ns_to_timespec(cpu.sched);
70         else
71                 cputime_to_timespec(cpu.cpu, tp);
72 }
73
74 static inline int cpu_time_before(const clockid_t which_clock,
75                                   union cpu_time_count now,
76                                   union cpu_time_count then)
77 {
78         if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
79                 return now.sched < then.sched;
80         } else {
81                 return now.cpu < then.cpu;
82         }
83 }
84 static inline void cpu_time_add(const clockid_t which_clock,
85                                 union cpu_time_count *acc,
86                                 union cpu_time_count val)
87 {
88         if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
89                 acc->sched += val.sched;
90         } else {
91                 acc->cpu += val.cpu;
92         }
93 }
94 static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
95                                                 union cpu_time_count a,
96                                                 union cpu_time_count b)
97 {
98         if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
99                 a.sched -= b.sched;
100         } else {
101                 a.cpu -= b.cpu;
102         }
103         return a;
104 }
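
/*
 * Note on union cpu_time_count: .sched carries nanoseconds for
 * CPUCLOCK_SCHED while .cpu carries a cputime_t for PROF/VIRT.  Because
 * .sched is zeroed whenever .cpu is used (see timespec_to_sample() above),
 * "expires.sched == 0" doubles as the clock-independent "timer not armed"
 * test used throughout this file.
 */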
105
106 /*
107  * Update expiry time from increment, and increase overrun count,
108  * given the current clock sample.
109  */
110 static void bump_cpu_timer(struct k_itimer *timer,
111                                   union cpu_time_count now)
112 {
113         int i;
114
115         if (timer->it.cpu.incr.sched == 0)
116                 return;
117
118         if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
119                 unsigned long long delta, incr;
120
121                 if (now.sched < timer->it.cpu.expires.sched)
122                         return;
123                 incr = timer->it.cpu.incr.sched;
124                 delta = now.sched + incr - timer->it.cpu.expires.sched;
125                 /* Don't use (incr*2 < delta), incr*2 might overflow. */
126                 for (i = 0; incr < delta - incr; i++)
127                         incr = incr << 1;
128                 for (; i >= 0; incr >>= 1, i--) {
129                         if (delta < incr)
130                                 continue;
131                         timer->it.cpu.expires.sched += incr;
132                         timer->it_overrun += 1 << i;
133                         delta -= incr;
134                 }
135         } else {
136                 cputime_t delta, incr;
137
138                 if (now.cpu < timer->it.cpu.expires.cpu)
139                         return;
140                 incr = timer->it.cpu.incr.cpu;
141                 delta = now.cpu + incr - timer->it.cpu.expires.cpu;
142                 /* Don't use (incr*2 < delta), incr*2 might overflow. */
143                 for (i = 0; incr < delta - incr; i++)
144                         incr += incr;
145                 for (; i >= 0; incr = incr >> 1, i--) {
146                         if (delta < incr)
147                                 continue;
148                         timer->it.cpu.expires.cpu += incr;
149                         timer->it_overrun += 1 << i;
150                         delta -= incr;
151                 }
152         }
153 }
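
/*
 * The doubling/halving loops above advance the expiry in O(log n) steps
 * rather than once per elapsed period.  Worked example (sketch): with
 * expires = 10, incr = 4 and now = 21, delta = 15; the first loop doubles
 * incr to 8 (i = 1), the second loop then adds 8 and 4, leaving
 * expires = 22 (just past "now") and it_overrun increased by 2 + 1 = 3,
 * i.e. three whole periods elapsed.
 */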
154
155 static inline cputime_t prof_ticks(struct task_struct *p)
156 {
157         return p->utime + p->stime;
158 }
159 static inline cputime_t virt_ticks(struct task_struct *p)
160 {
161         return p->utime;
162 }
163
164 static int
165 posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
166 {
167         int error = check_clock(which_clock);
168         if (!error) {
169                 tp->tv_sec = 0;
170                 tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
171                 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
172                         /*
173                          * If sched_clock is using a cycle counter, we
174                          * have no idea what its true exported resolution
175                          * is, but it is much finer than 1s/HZ.
176                          */
177                         tp->tv_nsec = 1;
178                 }
179         }
180         return error;
181 }
182
183 static int
184 posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
185 {
186         /*
187          * You can never reset a CPU clock, but we check for other errors
188          * in the call before failing with EPERM.
189          */
190         int error = check_clock(which_clock);
191         if (error == 0) {
192                 error = -EPERM;
193         }
194         return error;
195 }
196
197
198 /*
199  * Sample a per-thread clock for the given task.
200  */
201 static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
202                             union cpu_time_count *cpu)
203 {
204         switch (CPUCLOCK_WHICH(which_clock)) {
205         default:
206                 return -EINVAL;
207         case CPUCLOCK_PROF:
208                 cpu->cpu = prof_ticks(p);
209                 break;
210         case CPUCLOCK_VIRT:
211                 cpu->cpu = virt_ticks(p);
212                 break;
213         case CPUCLOCK_SCHED:
214                 cpu->sched = task_sched_runtime(p);
215                 break;
216         }
217         return 0;
218 }
219
220 static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
221 {
222         if (b->utime > a->utime)
223                 a->utime = b->utime;
224
225         if (b->stime > a->stime)
226                 a->stime = b->stime;
227
228         if (b->sum_exec_runtime > a->sum_exec_runtime)
229                 a->sum_exec_runtime = b->sum_exec_runtime;
230 }
231
232 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
233 {
234         struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
235         struct task_cputime sum;
236         unsigned long flags;
237
238         if (!cputimer->running) {
239                 /*
240                  * The POSIX timer interface allows for absolute time expiry
241                  * values through the TIMER_ABSTIME flag, therefore we have
242                  * to synchronize the timer to the clock every time we start
243                  * it.
244                  */
245                 thread_group_cputime(tsk, &sum);
246                 raw_spin_lock_irqsave(&cputimer->lock, flags);
247                 cputimer->running = 1;
248                 update_gt_cputime(&cputimer->cputime, &sum);
249         } else
250                 raw_spin_lock_irqsave(&cputimer->lock, flags);
251         *times = cputimer->cputime;
252         raw_spin_unlock_irqrestore(&cputimer->lock, flags);
253 }
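
/*
 * Once cputimer->running is set, the per-tick accounting (outside this
 * file) keeps cputimer->cputime up to date incrementally, so later calls
 * here are a cheap copy under cputimer->lock; only the first caller pays
 * for the full thread_group_cputime() summation, which is the reason for
 * the "synchronize the timer to the clock" dance above.
 */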
254
255 /*
256  * Sample a process (thread group) clock for the given group_leader task.
257  * Must be called with tasklist_lock held for reading.
258  */
259 static int cpu_clock_sample_group(const clockid_t which_clock,
260                                   struct task_struct *p,
261                                   union cpu_time_count *cpu)
262 {
263         struct task_cputime cputime;
264
265         switch (CPUCLOCK_WHICH(which_clock)) {
266         default:
267                 return -EINVAL;
268         case CPUCLOCK_PROF:
269                 thread_group_cputime(p, &cputime);
270                 cpu->cpu = cputime.utime + cputime.stime;
271                 break;
272         case CPUCLOCK_VIRT:
273                 thread_group_cputime(p, &cputime);
274                 cpu->cpu = cputime.utime;
275                 break;
276         case CPUCLOCK_SCHED:
277                 thread_group_cputime(p, &cputime);
278                 cpu->sched = cputime.sum_exec_runtime;
279                 break;
280         }
281         return 0;
282 }
283
284
285 static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
286 {
287         const pid_t pid = CPUCLOCK_PID(which_clock);
288         int error = -EINVAL;
289         union cpu_time_count rtn;
290
291         if (pid == 0) {
292                 /*
293                  * Special case constant value for our own clocks.
294                  * We don't have to do any lookup to find ourselves.
295                  */
296                 if (CPUCLOCK_PERTHREAD(which_clock)) {
297                         /*
298                          * Sampling just ourselves we can do with no locking.
299                          */
300                         error = cpu_clock_sample(which_clock,
301                                                  current, &rtn);
302                 } else {
303                         read_lock(&tasklist_lock);
304                         error = cpu_clock_sample_group(which_clock,
305                                                        current, &rtn);
306                         read_unlock(&tasklist_lock);
307                 }
308         } else {
309                 /*
310                  * Find the given PID, and validate that the caller
311                  * should be able to see it.
312                  */
313                 struct task_struct *p;
314                 rcu_read_lock();
315                 p = find_task_by_vpid(pid);
316                 if (p) {
317                         if (CPUCLOCK_PERTHREAD(which_clock)) {
318                                 if (same_thread_group(p, current)) {
319                                         error = cpu_clock_sample(which_clock,
320                                                                  p, &rtn);
321                                 }
322                         } else {
323                                 read_lock(&tasklist_lock);
324                                 if (thread_group_leader(p) && p->sighand) {
325                                         error =
326                                             cpu_clock_sample_group(which_clock,
327                                                                    p, &rtn);
328                                 }
329                                 read_unlock(&tasklist_lock);
330                         }
331                 }
332                 rcu_read_unlock();
333         }
334
335         if (error)
336                 return error;
337         sample_to_timespec(which_clock, rtn, tp);
338         return 0;
339 }
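
/*
 * Userspace path sketch (assuming the usual glibc mapping): a CPU-clock
 * clock_gettime() call is dispatched to posix_cpu_clock_get() through the
 * clock_posix_cpu k_clock registered at the bottom of this file, e.g.:
 *
 *   clockid_t cid;
 *   clock_getcpuclockid(pid, &cid);   => encodes pid as described above
 *   clock_gettime(cid, &ts);          => posix_cpu_clock_get(cid, &ts)
 */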
340
341
342 /*
343  * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
344  * This is called from sys_timer_create() and do_cpu_nanosleep() with the
345  * new timer already zero-initialized.
346  */
347 static int posix_cpu_timer_create(struct k_itimer *new_timer)
348 {
349         int ret = 0;
350         const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
351         struct task_struct *p;
352
353         if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
354                 return -EINVAL;
355
356         INIT_LIST_HEAD(&new_timer->it.cpu.entry);
357
358         rcu_read_lock();
359         if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
360                 if (pid == 0) {
361                         p = current;
362                 } else {
363                         p = find_task_by_vpid(pid);
364                         if (p && !same_thread_group(p, current))
365                                 p = NULL;
366                 }
367         } else {
368                 if (pid == 0) {
369                         p = current->group_leader;
370                 } else {
371                         p = find_task_by_vpid(pid);
372                         if (p && !has_group_leader_pid(p))
373                                 p = NULL;
374                 }
375         }
376         new_timer->it.cpu.task = p;
377         if (p) {
378                 get_task_struct(p);
379         } else {
380                 ret = -EINVAL;
381         }
382         rcu_read_unlock();
383
384         return ret;
385 }
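
/*
 * Usage sketch: creating a timer on the caller's own thread CPU clock, e.g.
 *
 *   struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
 *                           .sigev_signo  = SIGALRM };
 *   timer_t tid;
 *   timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &tid);
 *
 * ends up in thread_cpu_timer_create() (bottom of this file), which forces
 * it_clock to THREAD_CLOCK and then calls posix_cpu_timer_create() above,
 * taking the "pid == 0, per-thread" branch so the timer pins current.
 */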
386
387 /*
388  * Clean up a CPU-clock timer that is about to be destroyed.
389  * This is called from timer deletion with the timer already locked.
390  * If we return TIMER_RETRY, it's necessary to release the timer's lock
391  * and try again.  (This happens when the timer is in the middle of firing.)
392  */
393 static int posix_cpu_timer_del(struct k_itimer *timer)
394 {
395         struct task_struct *p = timer->it.cpu.task;
396         int ret = 0;
397
398         if (likely(p != NULL)) {
399                 read_lock(&tasklist_lock);
400                 if (unlikely(p->sighand == NULL)) {
401                         /*
402                          * We raced with the reaping of the task.
403                          * The deletion should have cleared us off the list.
404                          */
405                         BUG_ON(!list_empty(&timer->it.cpu.entry));
406                 } else {
407                         spin_lock(&p->sighand->siglock);
408                         if (timer->it.cpu.firing)
409                                 ret = TIMER_RETRY;
410                         else
411                                 list_del(&timer->it.cpu.entry);
412                         spin_unlock(&p->sighand->siglock);
413                 }
414                 read_unlock(&tasklist_lock);
415
416                 if (!ret)
417                         put_task_struct(p);
418         }
419
420         return ret;
421 }
422
423 /*
424  * Clean out CPU timers still ticking when a thread exited.  The task
425  * pointer is cleared, and the expiry time is replaced with the residual
426  * time for later timer_gettime calls to return.
427  * This must be called with the siglock held.
428  */
429 static void cleanup_timers(struct list_head *head,
430                            cputime_t utime, cputime_t stime,
431                            unsigned long long sum_exec_runtime)
432 {
433         struct cpu_timer_list *timer, *next;
434         cputime_t ptime = utime + stime;
435
436         list_for_each_entry_safe(timer, next, head, entry) {
437                 list_del_init(&timer->entry);
438                 if (timer->expires.cpu < ptime) {
439                         timer->expires.cpu = 0;
440                 } else {
441                         timer->expires.cpu -= ptime;
442                 }
443         }
444
445         ++head;
446         list_for_each_entry_safe(timer, next, head, entry) {
447                 list_del_init(&timer->entry);
448                 if (timer->expires.cpu < utime) {
449                         timer->expires.cpu = 0;
450                 } else {
451                         timer->expires.cpu -= utime;
452                 }
453         }
454
455         ++head;
456         list_for_each_entry_safe(timer, next, head, entry) {
457                 list_del_init(&timer->entry);
458                 if (timer->expires.sched < sum_exec_runtime) {
459                         timer->expires.sched = 0;
460                 } else {
461                         timer->expires.sched -= sum_exec_runtime;
462                 }
463         }
464 }
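
/*
 * Layout note: tsk->cpu_timers and signal->cpu_timers are arrays of three
 * lists indexed by CPUCLOCK_PROF, CPUCLOCK_VIRT and CPUCLOCK_SCHED (in that
 * order), which is why the three loops above simply step "++head" between
 * them, charging utime+stime, utime and sum_exec_runtime respectively.
 */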
465
466 /*
467  * These are both called with the siglock held, when the current thread
468  * is being reaped.  When the final (leader) thread in the group is reaped,
469  * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
470  */
471 void posix_cpu_timers_exit(struct task_struct *tsk)
472 {
473         cleanup_timers(tsk->cpu_timers,
474                        tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
475
476 }
477 void posix_cpu_timers_exit_group(struct task_struct *tsk)
478 {
479         struct signal_struct *const sig = tsk->signal;
480
481         cleanup_timers(tsk->signal->cpu_timers,
482                        tsk->utime + sig->utime, tsk->stime + sig->stime,
483                        tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
484 }
485
486 static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
487 {
488         /*
489          * That's all for this thread or process.
490          * We leave our residual in expires to be reported.
491          */
492         put_task_struct(timer->it.cpu.task);
493         timer->it.cpu.task = NULL;
494         timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
495                                              timer->it.cpu.expires,
496                                              now);
497 }
498
499 static inline int expires_gt(cputime_t expires, cputime_t new_exp)
500 {
501         return expires == 0 || expires > new_exp;
502 }
503
504 /*
505  * Insert the timer on the appropriate list before any timers that
506  * expire later.  This must be called with the tasklist_lock held
507  * for reading, interrupts disabled and p->sighand->siglock taken.
508  */
509 static void arm_timer(struct k_itimer *timer)
510 {
511         struct task_struct *p = timer->it.cpu.task;
512         struct list_head *head, *listpos;
513         struct task_cputime *cputime_expires;
514         struct cpu_timer_list *const nt = &timer->it.cpu;
515         struct cpu_timer_list *next;
516
517         if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
518                 head = p->cpu_timers;
519                 cputime_expires = &p->cputime_expires;
520         } else {
521                 head = p->signal->cpu_timers;
522                 cputime_expires = &p->signal->cputime_expires;
523         }
524         head += CPUCLOCK_WHICH(timer->it_clock);
525
526         listpos = head;
527         list_for_each_entry(next, head, entry) {
528                 if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
529                         break;
530                 listpos = &next->entry;
531         }
532         list_add(&nt->entry, listpos);
533
534         if (listpos == head) {
535                 union cpu_time_count *exp = &nt->expires;
536
537                 /*
538                  * We are the new earliest-expiring POSIX 1.b timer, hence
539                  * we need to update the expiration cache. Take into account that
540                  * for process timers we share expiration cache with itimers
541                  * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
542                  */
543
544                 switch (CPUCLOCK_WHICH(timer->it_clock)) {
545                 case CPUCLOCK_PROF:
546                         if (expires_gt(cputime_expires->prof_exp, exp->cpu))
547                                 cputime_expires->prof_exp = exp->cpu;
548                         break;
549                 case CPUCLOCK_VIRT:
550                         if (expires_gt(cputime_expires->virt_exp, exp->cpu))
551                                 cputime_expires->virt_exp = exp->cpu;
552                         break;
553                 case CPUCLOCK_SCHED:
554                         if (cputime_expires->sched_exp == 0 ||
555                             cputime_expires->sched_exp > exp->sched)
556                                 cputime_expires->sched_exp = exp->sched;
557                         break;
558                 }
559         }
560 }
561
562 /*
563  * The timer is locked, fire it and arrange for its reload.
564  */
565 static void cpu_timer_fire(struct k_itimer *timer)
566 {
567         if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
568                 /*
569                  * The user doesn't want any signal.
570                  */
571                 timer->it.cpu.expires.sched = 0;
572         } else if (unlikely(timer->sigq == NULL)) {
573                 /*
574                  * This is a special case for clock_nanosleep,
575                  * not a normal timer from sys_timer_create.
576                  */
577                 wake_up_process(timer->it_process);
578                 timer->it.cpu.expires.sched = 0;
579         } else if (timer->it.cpu.incr.sched == 0) {
580                 /*
581                  * One-shot timer.  Clear it as soon as it's fired.
582                  */
583                 posix_timer_event(timer, 0);
584                 timer->it.cpu.expires.sched = 0;
585         } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
586                 /*
587                  * The signal did not get queued because the signal
588                  * was ignored, so we won't get any callback to
589                  * reload the timer.  But we need to keep it
590                  * ticking in case the signal is deliverable next time.
591                  */
592                 posix_cpu_timer_schedule(timer);
593         }
594 }
595
596 /*
597  * Sample a process (thread group) timer for the given group_leader task.
598  * Must be called with tasklist_lock held for reading.
599  */
600 static int cpu_timer_sample_group(const clockid_t which_clock,
601                                   struct task_struct *p,
602                                   union cpu_time_count *cpu)
603 {
604         struct task_cputime cputime;
605
606         thread_group_cputimer(p, &cputime);
607         switch (CPUCLOCK_WHICH(which_clock)) {
608         default:
609                 return -EINVAL;
610         case CPUCLOCK_PROF:
611                 cpu->cpu = cputime.utime + cputime.stime;
612                 break;
613         case CPUCLOCK_VIRT:
614                 cpu->cpu = cputime.utime;
615                 break;
616         case CPUCLOCK_SCHED:
617                 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
618                 break;
619         }
620         return 0;
621 }
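
/*
 * Unlike cpu_clock_sample_group() above, this uses thread_group_cputimer()
 * (the cached, incrementally maintained totals, which also flips the
 * cputimer into the running state) and adds task_delta_exec() for runtime
 * not yet folded in on this tick; presumably so that a freshly armed
 * process-wide timer is measured against the same accounting that will
 * later expire it.
 */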
622
623 /*
624  * Guts of sys_timer_settime for CPU timers.
625  * This is called with the timer locked and interrupts disabled.
626  * If we return TIMER_RETRY, it's necessary to release the timer's lock
627  * and try again.  (This happens when the timer is in the middle of firing.)
628  */
629 static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
630                                struct itimerspec *new, struct itimerspec *old)
631 {
632         struct task_struct *p = timer->it.cpu.task;
633         union cpu_time_count old_expires, new_expires, old_incr, val;
634         int ret;
635
636         if (unlikely(p == NULL)) {
637                 /*
638                  * Timer refers to a dead task's clock.
639                  */
640                 return -ESRCH;
641         }
642
643         new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
644
645         read_lock(&tasklist_lock);
646         /*
647          * We need the tasklist_lock to protect against reaping that
648          * clears p->sighand.  If p has just been reaped, we can no
649          * longer get any information about it at all.
650          */
651         if (unlikely(p->sighand == NULL)) {
652                 read_unlock(&tasklist_lock);
653                 put_task_struct(p);
654                 timer->it.cpu.task = NULL;
655                 return -ESRCH;
656         }
657
658         /*
659          * Disarm any old timer after extracting its expiry time.
660          */
661         BUG_ON(!irqs_disabled());
662
663         ret = 0;
664         old_incr = timer->it.cpu.incr;
665         spin_lock(&p->sighand->siglock);
666         old_expires = timer->it.cpu.expires;
667         if (unlikely(timer->it.cpu.firing)) {
668                 timer->it.cpu.firing = -1;
669                 ret = TIMER_RETRY;
670         } else
671                 list_del_init(&timer->it.cpu.entry);
672
673         /*
674          * We need to sample the current value to convert the new
675          * value from relative to absolute, and to convert the
676          * old value from absolute to relative.  To set a process
677          * timer, we need a sample to balance the thread expiry
678          * times (in arm_timer).  With an absolute time, we must
679          * check if it's already passed.  In short, we need a sample.
680          */
681         if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
682                 cpu_clock_sample(timer->it_clock, p, &val);
683         } else {
684                 cpu_timer_sample_group(timer->it_clock, p, &val);
685         }
686
687         if (old) {
688                 if (old_expires.sched == 0) {
689                         old->it_value.tv_sec = 0;
690                         old->it_value.tv_nsec = 0;
691                 } else {
692                         /*
693                          * Update the timer in case it has
694                          * overrun already.  If it has,
695                          * we'll report it as having overrun
696                          * and with the next reloaded timer
697                          * already ticking, though we are
698                          * swallowing that pending
699                          * notification here to install the
700                          * new setting.
701                          */
702                         bump_cpu_timer(timer, val);
703                         if (cpu_time_before(timer->it_clock, val,
704                                             timer->it.cpu.expires)) {
705                                 old_expires = cpu_time_sub(
706                                         timer->it_clock,
707                                         timer->it.cpu.expires, val);
708                                 sample_to_timespec(timer->it_clock,
709                                                    old_expires,
710                                                    &old->it_value);
711                         } else {
712                                 old->it_value.tv_nsec = 1;
713                                 old->it_value.tv_sec = 0;
714                         }
715                 }
716         }
717
718         if (unlikely(ret)) {
719                 /*
720                  * We are colliding with the timer actually firing.
721                  * Punt after filling in the timer's old value, and
722                  * disable this firing since we are already reporting
723                  * it as an overrun (thanks to bump_cpu_timer above).
724                  */
725                 spin_unlock(&p->sighand->siglock);
726                 read_unlock(&tasklist_lock);
727                 goto out;
728         }
729
730         if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
731                 cpu_time_add(timer->it_clock, &new_expires, val);
732         }
733
734         /*
735          * Install the new expiry time (or zero).
736          * For a timer with no notification action, we don't actually
737          * arm the timer (we'll just fake it for timer_gettime).
738          */
739         timer->it.cpu.expires = new_expires;
740         if (new_expires.sched != 0 &&
741             cpu_time_before(timer->it_clock, val, new_expires)) {
742                 arm_timer(timer);
743         }
744
745         spin_unlock(&p->sighand->siglock);
746         read_unlock(&tasklist_lock);
747
748         /*
749          * Install the new reload setting, and
750          * set up the signal and overrun bookkeeping.
751          */
752         timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
753                                                 &new->it_interval);
754
755         /*
756          * This acts as a modification timestamp for the timer,
757          * so any automatic reload attempt will punt on seeing
758          * that we have reset the timer manually.
759          */
760         timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
761                 ~REQUEUE_PENDING;
762         timer->it_overrun_last = 0;
763         timer->it_overrun = -1;
764
765         if (new_expires.sched != 0 &&
766             !cpu_time_before(timer->it_clock, val, new_expires)) {
767                 /*
768                  * The designated time already passed, so we notify
769                  * immediately, even if the thread never runs to
770                  * accumulate more time on this clock.
771                  */
772                 cpu_timer_fire(timer);
773         }
774
775         ret = 0;
776  out:
777         if (old) {
778                 sample_to_timespec(timer->it_clock,
779                                    old_incr, &old->it_interval);
780         }
781         return ret;
782 }
783
784 static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
785 {
786         union cpu_time_count now;
787         struct task_struct *p = timer->it.cpu.task;
788         int clear_dead;
789
790         /*
791          * Easy part: convert the reload time.
792          */
793         sample_to_timespec(timer->it_clock,
794                            timer->it.cpu.incr, &itp->it_interval);
795
796         if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all.  */
797                 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
798                 return;
799         }
800
801         if (unlikely(p == NULL)) {
802                 /*
803                  * This task already died and the timer will never fire.
804                  * In this case, expires holds the residual (dead) value.
805                  */
806         dead:
807                 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
808                                    &itp->it_value);
809                 return;
810         }
811
812         /*
813          * Sample the clock to take the difference with the expiry time.
814          */
815         if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
816                 cpu_clock_sample(timer->it_clock, p, &now);
817                 clear_dead = p->exit_state;
818         } else {
819                 read_lock(&tasklist_lock);
820                 if (unlikely(p->sighand == NULL)) {
821                         /*
822                          * The process has been reaped.
823                          * We can't even collect a sample any more.
824                          * Call the timer disarmed, nothing else to do.
825                          */
826                         put_task_struct(p);
827                         timer->it.cpu.task = NULL;
828                         timer->it.cpu.expires.sched = 0;
829                         read_unlock(&tasklist_lock);
830                         goto dead;
831                 } else {
832                         cpu_timer_sample_group(timer->it_clock, p, &now);
833                         clear_dead = (unlikely(p->exit_state) &&
834                                       thread_group_empty(p));
835                 }
836                 read_unlock(&tasklist_lock);
837         }
838
839         if (unlikely(clear_dead)) {
840                 /*
841                  * We've noticed that the thread is dead, but
842                  * not yet reaped.  Take this opportunity to
843                  * drop our task ref.
844                  */
845                 clear_dead_task(timer, now);
846                 goto dead;
847         }
848
849         if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
850                 sample_to_timespec(timer->it_clock,
851                                    cpu_time_sub(timer->it_clock,
852                                                 timer->it.cpu.expires, now),
853                                    &itp->it_value);
854         } else {
855                 /*
856                  * The timer should have expired already, but the firing
857                  * hasn't taken place yet.  Say it's just about to expire.
858                  */
859                 itp->it_value.tv_nsec = 1;
860                 itp->it_value.tv_sec = 0;
861         }
862 }
863
864 /*
865  * Check for any per-thread CPU timers that have fired and move them off
866  * the tsk->cpu_timers[N] list onto the firing list.  Here we update the
867  * tsk->it_*_expires values to reflect the remaining thread CPU timers.
868  */
869 static void check_thread_timers(struct task_struct *tsk,
870                                 struct list_head *firing)
871 {
872         int maxfire;
873         struct list_head *timers = tsk->cpu_timers;
874         struct signal_struct *const sig = tsk->signal;
875         unsigned long soft;
876
877         maxfire = 20;
878         tsk->cputime_expires.prof_exp = 0;
879         while (!list_empty(timers)) {
880                 struct cpu_timer_list *t = list_first_entry(timers,
881                                                       struct cpu_timer_list,
882                                                       entry);
883                 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
884                         tsk->cputime_expires.prof_exp = t->expires.cpu;
885                         break;
886                 }
887                 t->firing = 1;
888                 list_move_tail(&t->entry, firing);
889         }
890
891         ++timers;
892         maxfire = 20;
893         tsk->cputime_expires.virt_exp = 0;
894         while (!list_empty(timers)) {
895                 struct cpu_timer_list *t = list_first_entry(timers,
896                                                       struct cpu_timer_list,
897                                                       entry);
898                 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
899                         tsk->cputime_expires.virt_exp = t->expires.cpu;
900                         break;
901                 }
902                 t->firing = 1;
903                 list_move_tail(&t->entry, firing);
904         }
905
906         ++timers;
907         maxfire = 20;
908         tsk->cputime_expires.sched_exp = 0;
909         while (!list_empty(timers)) {
910                 struct cpu_timer_list *t = list_first_entry(timers,
911                                                       struct cpu_timer_list,
912                                                       entry);
913                 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
914                         tsk->cputime_expires.sched_exp = t->expires.sched;
915                         break;
916                 }
917                 t->firing = 1;
918                 list_move_tail(&t->entry, firing);
919         }
920
921         /*
922          * Check for the special case thread timers.
923          */
924         soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
925         if (soft != RLIM_INFINITY) {
926                 unsigned long hard =
927                         ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
928
929                 if (hard != RLIM_INFINITY &&
930                     tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
931                         /*
932                          * At the hard limit, we just die.
933                          * No need to calculate anything else now.
934                          */
935                         __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
936                         return;
937                 }
938                 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
939                         /*
940                          * At the soft limit, send a SIGXCPU every second.
941                          */
942                         if (soft < hard) {
943                                 soft += USEC_PER_SEC;
944                                 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
945                         }
946                         printk(KERN_INFO
947                                 "RT Watchdog Timeout: %s[%d]\n",
948                                 tsk->comm, task_pid_nr(tsk));
949                         __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
950                 }
951         }
952 }
953
954 static void stop_process_timers(struct signal_struct *sig)
955 {
956         struct thread_group_cputimer *cputimer = &sig->cputimer;
957         unsigned long flags;
958
959         raw_spin_lock_irqsave(&cputimer->lock, flags);
960         cputimer->running = 0;
961         raw_spin_unlock_irqrestore(&cputimer->lock, flags);
962 }
963
964 static u32 onecputick;
965
966 static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
967                              cputime_t *expires, cputime_t cur_time, int signo)
968 {
969         if (!it->expires)
970                 return;
971
972         if (cur_time >= it->expires) {
973                 if (it->incr) {
974                         it->expires += it->incr;
975                         it->error += it->incr_error;
976                         if (it->error >= onecputick) {
977                                 it->expires -= cputime_one_jiffy;
978                                 it->error -= onecputick;
979                         }
980                 } else {
981                         it->expires = 0;
982                 }
983
984                 trace_itimer_expire(signo == SIGPROF ?
985                                     ITIMER_PROF : ITIMER_VIRTUAL,
986                                     tsk->signal->leader_pid, cur_time);
987                 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
988         }
989
990         if (it->expires && (!*expires || it->expires < *expires)) {
991                 *expires = it->expires;
992         }
993 }
994
995 /**
996  * task_cputime_zero - Check a task_cputime struct for all zero fields.
997  *
998  * @cputime:    The struct to compare.
999  *
1000  * Checks @cputime to see if all fields are zero.  Returns true if all fields
1001  * are zero, false if any field is nonzero.
1002  */
1003 static inline int task_cputime_zero(const struct task_cputime *cputime)
1004 {
1005         if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1006                 return 1;
1007         return 0;
1008 }
1009
1010 /*
1011  * Check for any per-process (thread group) CPU timers that have fired
1012  * and move them off the tsk->signal->cpu_timers[N] lists onto the firing
1013  * list.  Per-thread timers have already been taken off.
1014  */
1015 static void check_process_timers(struct task_struct *tsk,
1016                                  struct list_head *firing)
1017 {
1018         int maxfire;
1019         struct signal_struct *const sig = tsk->signal;
1020         cputime_t utime, ptime, virt_expires, prof_expires;
1021         unsigned long long sum_sched_runtime, sched_expires;
1022         struct list_head *timers = sig->cpu_timers;
1023         struct task_cputime cputime;
1024         unsigned long soft;
1025
1026         /*
1027          * Collect the current process totals.
1028          */
1029         thread_group_cputimer(tsk, &cputime);
1030         utime = cputime.utime;
1031         ptime = utime + cputime.stime;
1032         sum_sched_runtime = cputime.sum_exec_runtime;
1033         maxfire = 20;
1034         prof_expires = 0;
1035         while (!list_empty(timers)) {
1036                 struct cpu_timer_list *tl = list_first_entry(timers,
1037                                                       struct cpu_timer_list,
1038                                                       entry);
1039                 if (!--maxfire || ptime < tl->expires.cpu) {
1040                         prof_expires = tl->expires.cpu;
1041                         break;
1042                 }
1043                 tl->firing = 1;
1044                 list_move_tail(&tl->entry, firing);
1045         }
1046
1047         ++timers;
1048         maxfire = 20;
1049         virt_expires = 0;
1050         while (!list_empty(timers)) {
1051                 struct cpu_timer_list *tl = list_first_entry(timers,
1052                                                       struct cpu_timer_list,
1053                                                       entry);
1054                 if (!--maxfire || utime < tl->expires.cpu) {
1055                         virt_expires = tl->expires.cpu;
1056                         break;
1057                 }
1058                 tl->firing = 1;
1059                 list_move_tail(&tl->entry, firing);
1060         }
1061
1062         ++timers;
1063         maxfire = 20;
1064         sched_expires = 0;
1065         while (!list_empty(timers)) {
1066                 struct cpu_timer_list *tl = list_first_entry(timers,
1067                                                       struct cpu_timer_list,
1068                                                       entry);
1069                 if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
1070                         sched_expires = tl->expires.sched;
1071                         break;
1072                 }
1073                 tl->firing = 1;
1074                 list_move_tail(&tl->entry, firing);
1075         }
1076
1077         /*
1078          * Check for the special case process timers.
1079          */
1080         check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
1081                          SIGPROF);
1082         check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1083                          SIGVTALRM);
1084         soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1085         if (soft != RLIM_INFINITY) {
1086                 unsigned long psecs = cputime_to_secs(ptime);
1087                 unsigned long hard =
1088                         ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1089                 cputime_t x;
1090                 if (psecs >= hard) {
1091                         /*
1092                          * At the hard limit, we just die.
1093                          * No need to calculate anything else now.
1094                          */
1095                         __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1096                         return;
1097                 }
1098                 if (psecs >= soft) {
1099                         /*
1100                          * At the soft limit, send a SIGXCPU every second.
1101                          */
1102                         __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1103                         if (soft < hard) {
1104                                 soft++;
1105                                 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1106                         }
1107                 }
1108                 x = secs_to_cputime(soft);
1109                 if (!prof_expires || x < prof_expires) {
1110                         prof_expires = x;
1111                 }
1112         }
1113
1114         sig->cputime_expires.prof_exp = prof_expires;
1115         sig->cputime_expires.virt_exp = virt_expires;
1116         sig->cputime_expires.sched_exp = sched_expires;
1117         if (task_cputime_zero(&sig->cputime_expires))
1118                 stop_process_timers(sig);
1119 }
1120
1121 /*
1122  * This is called from the signal code (via do_schedule_next_timer)
1123  * when the last timer signal was delivered and we have to reload the timer.
1124  */
1125 void posix_cpu_timer_schedule(struct k_itimer *timer)
1126 {
1127         struct task_struct *p = timer->it.cpu.task;
1128         union cpu_time_count now;
1129
1130         if (unlikely(p == NULL))
1131                 /*
1132                  * The task was cleaned up already, no future firings.
1133                  */
1134                 goto out;
1135
1136         /*
1137          * Fetch the current sample and update the timer's expiry time.
1138          */
1139         if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
1140                 cpu_clock_sample(timer->it_clock, p, &now);
1141                 bump_cpu_timer(timer, now);
1142                 if (unlikely(p->exit_state)) {
1143                         clear_dead_task(timer, now);
1144                         goto out;
1145                 }
1146                 read_lock(&tasklist_lock); /* arm_timer needs it.  */
1147                 spin_lock(&p->sighand->siglock);
1148         } else {
1149                 read_lock(&tasklist_lock);
1150                 if (unlikely(p->sighand == NULL)) {
1151                         /*
1152                          * The process has been reaped.
1153                          * We can't even collect a sample any more.
1154                          */
1155                         put_task_struct(p);
1156                         timer->it.cpu.task = p = NULL;
1157                         timer->it.cpu.expires.sched = 0;
1158                         goto out_unlock;
1159                 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1160                         /*
1161                          * We've noticed that the thread is dead, but
1162                          * not yet reaped.  Take this opportunity to
1163                          * drop our task ref.
1164                          */
1165                         clear_dead_task(timer, now);
1166                         goto out_unlock;
1167                 }
1168                 spin_lock(&p->sighand->siglock);
1169                 cpu_timer_sample_group(timer->it_clock, p, &now);
1170                 bump_cpu_timer(timer, now);
1171                 /* Leave the tasklist_lock locked for the call below.  */
1172         }
1173
1174         /*
1175          * Now re-arm for the new expiry time.
1176          */
1177         BUG_ON(!irqs_disabled());
1178         arm_timer(timer);
1179         spin_unlock(&p->sighand->siglock);
1180
1181 out_unlock:
1182         read_unlock(&tasklist_lock);
1183
1184 out:
1185         timer->it_overrun_last = timer->it_overrun;
1186         timer->it_overrun = -1;
1187         ++timer->it_requeue_pending;
1188 }
1189
1190 /**
1191  * task_cputime_expired - Compare two task_cputime entities.
1192  *
1193  * @sample:     The task_cputime structure to be checked for expiration.
1194  * @expires:    Expiration times, against which @sample will be checked.
1195  *
1196  * Checks @sample against @expires to see if any field of @sample has expired.
1197  * Returns true if any field of @sample is greater than or equal to the
1198  * corresponding nonzero field of @expires; otherwise returns false.
1199  */
1200 static inline int task_cputime_expired(const struct task_cputime *sample,
1201                                         const struct task_cputime *expires)
1202 {
1203         if (expires->utime && sample->utime >= expires->utime)
1204                 return 1;
1205         if (expires->stime && sample->utime + sample->stime >= expires->stime)
1206                 return 1;
1207         if (expires->sum_exec_runtime != 0 &&
1208             sample->sum_exec_runtime >= expires->sum_exec_runtime)
1209                 return 1;
1210         return 0;
1211 }
1212
1213 /**
1214  * fastpath_timer_check - POSIX CPU timers fast path.
1215  *
1216  * @tsk:        The task (thread) being checked.
1217  *
1218  * Check the task and thread group timers.  If both are zero (there are no
1219  * timers set) return false.  Otherwise snapshot the task and thread group
1220  * timers and compare them with the corresponding expiration times.  Return
1221  * true if a timer has expired, else return false.
1222  */
1223 static inline int fastpath_timer_check(struct task_struct *tsk)
1224 {
1225         struct signal_struct *sig;
1226
1227         if (!task_cputime_zero(&tsk->cputime_expires)) {
1228                 struct task_cputime task_sample = {
1229                         .utime = tsk->utime,
1230                         .stime = tsk->stime,
1231                         .sum_exec_runtime = tsk->se.sum_exec_runtime
1232                 };
1233
1234                 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1235                         return 1;
1236         }
1237
1238         sig = tsk->signal;
1239         if (sig->cputimer.running) {
1240                 struct task_cputime group_sample;
1241
1242                 raw_spin_lock(&sig->cputimer.lock);
1243                 group_sample = sig->cputimer.cputime;
1244                 raw_spin_unlock(&sig->cputimer.lock);
1245
1246                 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1247                         return 1;
1248         }
1249
1250         return 0;
1251 }
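
/*
 * This is the per-tick gate: run_posix_cpu_timers() below calls it on every
 * tick with interrupts disabled, so it deliberately avoids taking the
 * sighand/siglock and only peeks at the cached group totals under
 * cputimer->lock.
 */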
1252
1253 /*
1254  * This is called from the timer interrupt handler.  The irq handler has
1255  * already updated our counts.  We need to check if any timers fire now.
1256  * Interrupts are disabled.
1257  */
1258 void run_posix_cpu_timers(struct task_struct *tsk)
1259 {
1260         LIST_HEAD(firing);
1261         struct k_itimer *timer, *next;
1262         unsigned long flags;
1263
1264         BUG_ON(!irqs_disabled());
1265
1266         /*
1267          * The fast path checks that there are no expired thread or thread
1268          * group timers.  If that's so, just return.
1269          */
1270         if (!fastpath_timer_check(tsk))
1271                 return;
1272
1273         if (!lock_task_sighand(tsk, &flags))
1274                 return;
1275         /*
1276          * Here we take all the timers that are firing off the
1277          * tsk->signal->cpu_timers[N] and tsk->cpu_timers[N] lists
1278          * and put them on the firing list.
1279          */
1280         check_thread_timers(tsk, &firing);
1281         /*
1282          * If there are any active process wide timers (POSIX 1.b, itimers,
1283          * RLIMIT_CPU) cputimer must be running.
1284          */
1285         if (tsk->signal->cputimer.running)
1286                 check_process_timers(tsk, &firing);
1287
1288         /*
1289          * We must release these locks before taking any timer's lock.
1290          * There is a potential race with timer deletion here, as the
1291          * siglock now protects our private firing list.  We have set
1292          * the firing flag in each timer, so that a deletion attempt
1293          * that gets the timer lock before we do will give it up and
1294          * spin until we've taken care of that timer below.
1295          */
1296         unlock_task_sighand(tsk, &flags);
1297
1298         /*
1299          * Now that all the timers on our list have the firing flag,
1300          * no one will touch their list entries but us.  We'll take
1301          * each timer's lock before clearing its firing flag, so no
1302          * timer call will interfere.
1303          */
1304         list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
1305                 int cpu_firing;
1306
1307                 spin_lock(&timer->it_lock);
1308                 list_del_init(&timer->it.cpu.entry);
1309                 cpu_firing = timer->it.cpu.firing;
1310                 timer->it.cpu.firing = 0;
1311                 /*
1312                  * The firing flag is -1 if we collided with a reset
1313                  * of the timer, which already reported this
1314                  * almost-firing as an overrun.  So don't generate an event.
1315                  */
1316                 if (likely(cpu_firing >= 0))
1317                         cpu_timer_fire(timer);
1318                 spin_unlock(&timer->it_lock);
1319         }
1320 }
1321
1322 /*
1323  * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1324  * The tsk->sighand->siglock must be held by the caller.
1325  */
1326 void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1327                            cputime_t *newval, cputime_t *oldval)
1328 {
1329         union cpu_time_count now;
1330
1331         BUG_ON(clock_idx == CPUCLOCK_SCHED);
1332         cpu_timer_sample_group(clock_idx, tsk, &now);
1333
1334         if (oldval) {
1335                 /*
1336          * We are setting the itimer. The *oldval is absolute and we update
1337          * it to be relative; the *newval argument is relative and we update
1338                  * it to be absolute.
1339                  */
1340                 if (*oldval) {
1341                         if (*oldval <= now.cpu) {
1342                                 /* Just about to fire. */
1343                                 *oldval = cputime_one_jiffy;
1344                         } else {
1345                                 *oldval -= now.cpu;
1346                         }
1347                 }
1348
1349                 if (!*newval)
1350                         return;
1351                 *newval += now.cpu;
1352         }
1353
1354         /*
1355          * Update the expiration cache if we are the earliest timer, or if the
1356          * RLIMIT_CPU limit is earlier than the current prof_exp cpu timer expiry.
1357          */
1358         switch (clock_idx) {
1359         case CPUCLOCK_PROF:
1360                 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1361                         tsk->signal->cputime_expires.prof_exp = *newval;
1362                 break;
1363         case CPUCLOCK_VIRT:
1364                 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1365                         tsk->signal->cputime_expires.virt_exp = *newval;
1366                 break;
1367         }
1368 }
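
/*
 * Note the two flavours of caller visible from this file: update_rlimit_cpu()
 * at the top passes oldval == NULL (a pure RLIMIT_CPU update), while an
 * itimer-style caller passes both pointers and gets the old value converted
 * from absolute to relative and the new one from relative to absolute, as
 * described in the comment above.
 */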
1369
1370 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1371                             struct timespec *rqtp, struct itimerspec *it)
1372 {
1373         struct k_itimer timer;
1374         int error;
1375
1376         /*
1377          * Set up a temporary timer and then wait for it to go off.
1378          */
1379         memset(&timer, 0, sizeof timer);
1380         spin_lock_init(&timer.it_lock);
1381         timer.it_clock = which_clock;
1382         timer.it_overrun = -1;
1383         error = posix_cpu_timer_create(&timer);
1384         timer.it_process = current;
1385         if (!error) {
1386                 static struct itimerspec zero_it;
1387
1388                 memset(it, 0, sizeof *it);
1389                 it->it_value = *rqtp;
1390
1391                 spin_lock_irq(&timer.it_lock);
1392                 error = posix_cpu_timer_set(&timer, flags, it, NULL);
1393                 if (error) {
1394                         spin_unlock_irq(&timer.it_lock);
1395                         return error;
1396                 }
1397
1398                 while (!signal_pending(current)) {
1399                         if (timer.it.cpu.expires.sched == 0) {
1400                                 /*
1401                                  * Our timer fired and was reset.
1402                                  */
1403                                 spin_unlock_irq(&timer.it_lock);
1404                                 return 0;
1405                         }
1406
1407                         /*
1408                          * Block until cpu_timer_fire (or a signal) wakes us.
1409                          */
1410                         __set_current_state(TASK_INTERRUPTIBLE);
1411                         spin_unlock_irq(&timer.it_lock);
1412                         schedule();
1413                         spin_lock_irq(&timer.it_lock);
1414                 }
1415
1416                 /*
1417                  * We were interrupted by a signal.
1418                  */
1419                 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1420                 posix_cpu_timer_set(&timer, 0, &zero_it, it);
1421                 spin_unlock_irq(&timer.it_lock);
1422
1423                 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1424                         /*
1425                          * It actually did fire already.
1426                          */
1427                         return 0;
1428                 }
1429
1430                 error = -ERESTART_RESTARTBLOCK;
1431         }
1432
1433         return error;
1434 }
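
/*
 * The trick here: the on-stack k_itimer never gets a sigqueue (timer.sigq
 * stays NULL), so when it expires cpu_timer_fire() takes its
 * "sigq == NULL" branch and simply wake_up_process()es us instead of
 * delivering a signal; the loop above then observes expires.sched == 0
 * and returns.
 */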
1435
1436 static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1437
1438 static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1439                             struct timespec *rqtp, struct timespec __user *rmtp)
1440 {
1441         struct restart_block *restart_block =
1442                 &current_thread_info()->restart_block;
1443         struct itimerspec it;
1444         int error;
1445
1446         /*
1447          * Diagnose required errors first.
1448          */
1449         if (CPUCLOCK_PERTHREAD(which_clock) &&
1450             (CPUCLOCK_PID(which_clock) == 0 ||
1451              CPUCLOCK_PID(which_clock) == current->pid))
1452                 return -EINVAL;
1453
1454         error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
1455
1456         if (error == -ERESTART_RESTARTBLOCK) {
1457
1458                 if (flags & TIMER_ABSTIME)
1459                         return -ERESTARTNOHAND;
1460                 /*
1461                  * Report back to the user the time still remaining.
1462                  */
1463                 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1464                         return -EFAULT;
1465
1466                 restart_block->fn = posix_cpu_nsleep_restart;
1467                 restart_block->nanosleep.clockid = which_clock;
1468                 restart_block->nanosleep.rmtp = rmtp;
1469                 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1470         }
1471         return error;
1472 }
1473
1474 static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1475 {
1476         clockid_t which_clock = restart_block->nanosleep.clockid;
1477         struct timespec t;
1478         struct itimerspec it;
1479         int error;
1480
1481         t = ns_to_timespec(restart_block->nanosleep.expires);
1482
1483         error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1484
1485         if (error == -ERESTART_RESTARTBLOCK) {
1486                 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1487                 /*
1488                  * Report back to the user the time still remaining.
1489                  */
1490                 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1491                         return -EFAULT;
1492
1493                 restart_block->nanosleep.expires = timespec_to_ns(&t);
1494         }
1495         return error;
1496
1497 }
1498
1499 #define PROCESS_CLOCK   MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1500 #define THREAD_CLOCK    MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1501
1502 static int process_cpu_clock_getres(const clockid_t which_clock,
1503                                     struct timespec *tp)
1504 {
1505         return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1506 }
1507 static int process_cpu_clock_get(const clockid_t which_clock,
1508                                  struct timespec *tp)
1509 {
1510         return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1511 }
1512 static int process_cpu_timer_create(struct k_itimer *timer)
1513 {
1514         timer->it_clock = PROCESS_CLOCK;
1515         return posix_cpu_timer_create(timer);
1516 }
1517 static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1518                               struct timespec *rqtp,
1519                               struct timespec __user *rmtp)
1520 {
1521         return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1522 }
1523 static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1524 {
1525         return -EINVAL;
1526 }
1527 static int thread_cpu_clock_getres(const clockid_t which_clock,
1528                                    struct timespec *tp)
1529 {
1530         return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1531 }
1532 static int thread_cpu_clock_get(const clockid_t which_clock,
1533                                 struct timespec *tp)
1534 {
1535         return posix_cpu_clock_get(THREAD_CLOCK, tp);
1536 }
1537 static int thread_cpu_timer_create(struct k_itimer *timer)
1538 {
1539         timer->it_clock = THREAD_CLOCK;
1540         return posix_cpu_timer_create(timer);
1541 }
1542
1543 struct k_clock clock_posix_cpu = {
1544         .clock_getres   = posix_cpu_clock_getres,
1545         .clock_set      = posix_cpu_clock_set,
1546         .clock_get      = posix_cpu_clock_get,
1547         .timer_create   = posix_cpu_timer_create,
1548         .nsleep         = posix_cpu_nsleep,
1549         .nsleep_restart = posix_cpu_nsleep_restart,
1550         .timer_set      = posix_cpu_timer_set,
1551         .timer_del      = posix_cpu_timer_del,
1552         .timer_get      = posix_cpu_timer_get,
1553 };
1554
1555 static __init int init_posix_cpu_timers(void)
1556 {
1557         struct k_clock process = {
1558                 .clock_getres   = process_cpu_clock_getres,
1559                 .clock_get      = process_cpu_clock_get,
1560                 .timer_create   = process_cpu_timer_create,
1561                 .nsleep         = process_cpu_nsleep,
1562                 .nsleep_restart = process_cpu_nsleep_restart,
1563         };
1564         struct k_clock thread = {
1565                 .clock_getres   = thread_cpu_clock_getres,
1566                 .clock_get      = thread_cpu_clock_get,
1567                 .timer_create   = thread_cpu_timer_create,
1568         };
1569         struct timespec ts;
1570
1571         posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1572         posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1573
1574         cputime_to_timespec(cputime_one_jiffy, &ts);
1575         onecputick = ts.tv_nsec;
1576         WARN_ON(ts.tv_sec != 0);
1577
1578         return 0;
1579 }
1580 __initcall(init_posix_cpu_timers);