kernel/sched/cputime.c

   1 #include <linux/export.h>
   2 #include <linux/sched.h>
   3 #include <linux/tsacct_kern.h>
   4 #include <linux/kernel_stat.h>
   5 #include <linux/static_key.h>
   6 #include <linux/context_tracking.h>
   7 #include "sched.h"
   8 #ifdef CONFIG_PARAVIRT
   9 #include <asm/paravirt.h>
  10 #endif
  11
  12
  13 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
  14
/*
 * There are no locks covering the per-cpu hardirq/softirq time.
 * In this file the counters are only modified in irqtime_account_irq(),
 * on the corresponding CPU with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in another CPU reading this CPU's irq time, which can
 * race with irq accounting on this CPU. We would either get the old
 * or the new value, with the side effect of accounting a slice of irq time
 * to the wrong task when an irq is in progress while we read rq->clock.
 * That is a worthy compromise in place of having locks on each irq in
 * account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

/*
 * Non-zero while irq time accounting is switched on; set by
 * enable_sched_clock_irqtime() and cleared by disable_sched_clock_irqtime().
 */
static int sched_clock_irqtime;
  29
  30 void enable_sched_clock_irqtime(void)
  31 {
  32         sched_clock_irqtime = 1;
  33 }
  34
  35 void disable_sched_clock_irqtime(void)
  36 {
  37         sched_clock_irqtime = 0;
  38 }
  39
  40 /*
  41  * Called before incrementing preempt_count on {soft,}irq_enter
  42  * and before decrementing preempt_count on {soft,}irq_exit.
  43  */
  44 void irqtime_account_irq(struct task_struct *curr)
  45 {
  46         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
  47         s64 delta;
  48         int cpu;
  49
  50         if (!sched_clock_irqtime)
  51                 return;
  52
  53         cpu = smp_processor_id();
  54         delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
  55         irqtime->irq_start_time += delta;
  56
  57         u64_stats_update_begin(&irqtime->sync);
  58         /*
  59          * We do not account for softirq time from ksoftirqd here.
  60          * We want to continue accounting softirq time to ksoftirqd thread
  61          * in that case, so as not to confuse scheduler with a special task
  62          * that do not consume any time, but still wants to run.
  63          */
  64         if (hardirq_count())
  65                 irqtime->hardirq_time += delta;
  66         else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
  67                 irqtime->softirq_time += delta;
  68
  69         u64_stats_update_end(&irqtime->sync);
  70 }
  71 EXPORT_SYMBOL_GPL(irqtime_account_irq);
  72
  73 static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
  74 {
  75         u64 *cpustat = kcpustat_this_cpu->cpustat;
  76         cputime_t irq_cputime;
  77
  78         irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
  79         irq_cputime = min(irq_cputime, maxtime);
  80         cpustat[idx] += irq_cputime;
  81
  82         return irq_cputime;
  83 }
  84
  85 static cputime_t irqtime_account_hi_update(cputime_t maxtime)
  86 {
  87         return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
  88                                       CPUTIME_IRQ, maxtime);
  89 }
  90
  91 static cputime_t irqtime_account_si_update(cputime_t maxtime)
  92 {
  93         return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
  94                                       CPUTIME_SOFTIRQ, maxtime);
  95 }
  96
  97 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
  98
/* Without CONFIG_IRQ_TIME_ACCOUNTING the flag is a compile-time constant 0. */
#define sched_clock_irqtime     (0)
 100
 101 static cputime_t irqtime_account_hi_update(cputime_t dummy)
 102 {
 103         return 0;
 104 }
 105
 106 static cputime_t irqtime_account_si_update(cputime_t dummy)
 107 {
 108         return 0;
 109 }
 110
 111 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 112
 113 static inline void task_group_account_field(struct task_struct *p, int index,
 114                                             u64 tmp)
 115 {
 116         /*
 117          * Since all updates are sure to touch the root cgroup, we
 118          * get ourselves ahead and touch it first. If the root cgroup
 119          * is the only cgroup, then nothing else should be necessary.
 120          *
 121          */
 122         __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 123
 124         cpuacct_account_field(p, index, tmp);
 125 }
 126
 127 /*
 128  * Account user cpu time to a process.
 129  * @p: the process that the cpu time gets accounted to
 130  * @cputime: the cpu time spent in user space since the last update
 131  * @cputime_scaled: cputime scaled by cpu frequency
 132  */
 133 void account_user_time(struct task_struct *p, cputime_t cputime,
 134                        cputime_t cputime_scaled)
 135 {
 136         int index;
 137
 138         /* Add user time to process. */
 139         p->utime += cputime;
 140         p->utimescaled += cputime_scaled;
 141         account_group_user_time(p, cputime);
 142
 143         index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 144
 145         /* Add user time to cpustat. */
 146         task_group_account_field(p, index, (__force u64) cputime);
 147
 148         /* Account for user time used */
 149         acct_account_cputime(p);
 150 }
 151
 152 /*
 153  * Account guest cpu time to a process.
 154  * @p: the process that the cpu time gets accounted to
 155  * @cputime: the cpu time spent in virtual machine since the last update
 156  * @cputime_scaled: cputime scaled by cpu frequency
 157  */
 158 static void account_guest_time(struct task_struct *p, cputime_t cputime,
 159                                cputime_t cputime_scaled)
 160 {
 161         u64 *cpustat = kcpustat_this_cpu->cpustat;
 162
 163         /* Add guest time to process. */
 164         p->utime += cputime;
 165         p->utimescaled += cputime_scaled;
 166         account_group_user_time(p, cputime);
 167         p->gtime += cputime;
 168
 169         /* Add guest time to cpustat. */
 170         if (task_nice(p) > 0) {
 171                 cpustat[CPUTIME_NICE] += (__force u64) cputime;
 172                 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 173         } else {
 174                 cpustat[CPUTIME_USER] += (__force u64) cputime;
 175                 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
 176         }
 177 }
 178
 179 /*
 180  * Account system cpu time to a process and desired cpustat field
 181  * @p: the process that the cpu time gets accounted to
 182  * @cputime: the cpu time spent in kernel space since the last update
 183  * @cputime_scaled: cputime scaled by cpu frequency
 184  * @target_cputime64: pointer to cpustat field that has to be updated
 185  */
 186 static inline
 187 void __account_system_time(struct task_struct *p, cputime_t cputime,
 188                         cputime_t cputime_scaled, int index)
 189 {
 190         /* Add system time to process. */
 191         p->stime += cputime;
 192         p->stimescaled += cputime_scaled;
 193         account_group_system_time(p, cputime);
 194
 195         /* Add system time to cpustat. */
 196         task_group_account_field(p, index, (__force u64) cputime);
 197
 198         /* Account for system time used */
 199         acct_account_cputime(p);
 200 }
 201
 202 /*
 203  * Account system cpu time to a process.
 204  * @p: the process that the cpu time gets accounted to
 205  * @hardirq_offset: the offset to subtract from hardirq_count()
 206  * @cputime: the cpu time spent in kernel space since the last update
 207  * @cputime_scaled: cputime scaled by cpu frequency
 208  */
 209 void account_system_time(struct task_struct *p, int hardirq_offset,
 210                          cputime_t cputime, cputime_t cputime_scaled)
 211 {
 212         int index;
 213
 214         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 215                 account_guest_time(p, cputime, cputime_scaled);
 216                 return;
 217         }
 218
 219         if (hardirq_count() - hardirq_offset)
 220                 index = CPUTIME_IRQ;
 221         else if (in_serving_softirq())
 222                 index = CPUTIME_SOFTIRQ;
 223         else
 224                 index = CPUTIME_SYSTEM;
 225
 226         __account_system_time(p, cputime, cputime_scaled, index);
 227 }
 228
 229 /*
 230  * Account for involuntary wait time.
 231  * @cputime: the cpu time spent in involuntary wait
 232  */
 233 void account_steal_time(cputime_t cputime)
 234 {
 235         u64 *cpustat = kcpustat_this_cpu->cpustat;
 236
 237         cpustat[CPUTIME_STEAL] += (__force u64) cputime;
 238 }
 239
 240 /*
 241  * Account for idle time.
 242  * @cputime: the cpu time spent in idle wait
 243  */
 244 void account_idle_time(cputime_t cputime)
 245 {
 246         u64 *cpustat = kcpustat_this_cpu->cpustat;
 247         struct rq *rq = this_rq();
 248
 249         if (atomic_read(&rq->nr_iowait) > 0)
 250                 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
 251         else
 252                 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 253 }
 254
 255 /*
 256  * When a guest is interrupted for a longer amount of time, missed clock
 257  * ticks are not redelivered later. Due to that, this function may on
 258  * occasion account more time than the calling functions think elapsed.
 259  */
 260 static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 261 {
 262 #ifdef CONFIG_PARAVIRT
 263         if (static_key_false(&paravirt_steal_enabled)) {
 264                 cputime_t steal_cputime;
 265                 u64 steal;
 266
 267                 steal = paravirt_steal_clock(smp_processor_id());
 268                 steal -= this_rq()->prev_steal_time;
 269
 270                 steal_cputime = min(nsecs_to_cputime(steal), maxtime);
 271                 account_steal_time(steal_cputime);
 272                 this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
 273
 274                 return steal_cputime;
 275         }
 276 #endif
 277         return 0;
 278 }
 279
 280 /*
 281  * Account how much elapsed time was spent in steal, irq, or softirq time.
 282  */
 283 static inline cputime_t account_other_time(cputime_t max)
 284 {
 285         cputime_t accounted;
 286
 287         /* Shall be converted to a lockdep-enabled lightweight check */
 288         WARN_ON_ONCE(!irqs_disabled());
 289
 290         accounted = steal_account_process_time(max);
 291
 292         if (accounted < max)
 293                 accounted += irqtime_account_hi_update(max - accounted);
 294
 295         if (accounted < max)
 296                 accounted += irqtime_account_si_update(max - accounted);
 297
 298         return accounted;
 299 }
 300
 301 #ifdef CONFIG_64BIT
 302 static inline u64 read_sum_exec_runtime(struct task_struct *t)
 303 {
 304         return t->se.sum_exec_runtime;
 305 }
 306 #else
 307 static u64 read_sum_exec_runtime(struct task_struct *t)
 308 {
 309         u64 ns;
 310         struct rq_flags rf;
 311         struct rq *rq;
 312
 313         rq = task_rq_lock(t, &rf);
 314         ns = t->se.sum_exec_runtime;
 315         task_rq_unlock(rq, t, &rf);
 316
 317         return ns;
 318 }
 319 #endif
 320
 321 /*
 322  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 323  * tasks (sum on group iteration) belonging to @tsk's group.
 324  */
 325 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 326 {
 327         struct signal_struct *sig = tsk->signal;
 328         cputime_t utime, stime;
 329         struct task_struct *t;
 330         unsigned int seq, nextseq;
 331         unsigned long flags;
 332
 333         /*
 334          * Update current task runtime to account pending time since last
 335          * scheduler action or thread_group_cputime() call. This thread group
 336          * might have other running tasks on different CPUs, but updating
 337          * their runtime can affect syscall performance, so we skip account
 338          * those pending times and rely only on values updated on tick or
 339          * other scheduler action.
 340          */
 341         if (same_thread_group(current, tsk))
 342                 (void) task_sched_runtime(current);
 343
 344         rcu_read_lock();
 345         /* Attempt a lockless read on the first round. */
 346         nextseq = 0;
 347         do {
 348                 seq = nextseq;
 349                 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
 350                 times->utime = sig->utime;
 351                 times->stime = sig->stime;
 352                 times->sum_exec_runtime = sig->sum_sched_runtime;
 353
 354                 for_each_thread(tsk, t) {
 355                         task_cputime(t, &utime, &stime);
 356                         times->utime += utime;
 357                         times->stime += stime;
 358                         times->sum_exec_runtime += read_sum_exec_runtime(t);
 359                 }
 360                 /* If lockless access failed, take the lock. */
 361                 nextseq = 1;
 362         } while (need_seqretry(&sig->stats_lock, seq));
 363         done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
 364         rcu_read_unlock();
 365 }
 366
 367 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 368 /*
 369  * Account a tick to a process and cpustat
 370  * @p: the process that the cpu time gets accounted to
 371  * @user_tick: is the tick from userspace
 372  * @rq: the pointer to rq
 373  *
 374  * Tick demultiplexing follows the order
 375  * - pending hardirq update
 376  * - pending softirq update
 377  * - user_time
 378  * - idle_time
 379  * - system time
 380  *   - check for guest_time
 381  *   - else account as system_time
 382  *
 383  * Check for hardirq is done both for system and user time as there is
 384  * no timer going off while we are on hardirq and hence we may never get an
 385  * opportunity to update it solely in system time.
 386  * p->stime and friends are only updated on system time and not on irq
 387  * softirq as those do not count in task exec_runtime any more.
 388  */
 389 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 390                                          struct rq *rq, int ticks)
 391 {
 392         u64 cputime = (__force u64) cputime_one_jiffy * ticks;
 393         cputime_t scaled, other;
 394
 395         /*
 396          * When returning from idle, many ticks can get accounted at
 397          * once, including some ticks of steal, irq, and softirq time.
 398          * Subtract those ticks from the amount of time accounted to
 399          * idle, or potentially user or system time. Due to rounding,
 400          * other time can exceed ticks occasionally.
 401          */
 402         other = account_other_time(ULONG_MAX);
 403         if (other >= cputime)
 404                 return;
 405         cputime -= other;
 406         scaled = cputime_to_scaled(cputime);
 407
 408         if (this_cpu_ksoftirqd() == p) {
 409                 /*
 410                  * ksoftirqd time do not get accounted in cpu_softirq_time.
 411                  * So, we have to handle it separately here.
 412                  * Also, p->stime needs to be updated for ksoftirqd.
 413                  */
 414                 __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
 415         } else if (user_tick) {
 416                 account_user_time(p, cputime, scaled);
 417         } else if (p == rq->idle) {
 418                 account_idle_time(cputime);
 419         } else if (p->flags & PF_VCPU) { /* System time or guest time */
 420                 account_guest_time(p, cputime, scaled);
 421         } else {
 422                 __account_system_time(p, cputime, scaled,       CPUTIME_SYSTEM);
 423         }
 424 }
 425
 426 static void irqtime_account_idle_ticks(int ticks)
 427 {
 428         struct rq *rq = this_rq();
 429
 430         irqtime_account_process_tick(current, 0, rq, ticks);
 431 }
 432 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 433 static inline void irqtime_account_idle_ticks(int ticks) {}
 434 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 435                                                 struct rq *rq, int nr_ticks) {}
 436 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 437
 438 /*
 439  * Use precise platform statistics if available:
 440  */
 441 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 442
 443 #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
 444 void vtime_common_task_switch(struct task_struct *prev)
 445 {
 446         if (is_idle_task(prev))
 447                 vtime_account_idle(prev);
 448         else
 449                 vtime_account_system(prev);
 450
 451 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 452         vtime_account_user(prev);
 453 #endif
 454         arch_vtime_task_switch(prev);
 455 }
 456 #endif
 457
 458 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 459
 460
 461 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
 * Archs that account the whole time spent in the idle task
 * (outside irq) as idle time can rely on this and just implement
 * vtime_account_system() and vtime_account_idle(). Archs that
 * give idle time a different meaning (s390 only includes the
 * time spent by the CPU when it's in low power mode) must override
 * vtime_account().
 */
 470 #ifndef __ARCH_HAS_VTIME_ACCOUNT
 471 void vtime_account_irq_enter(struct task_struct *tsk)
 472 {
 473         if (!in_interrupt() && is_idle_task(tsk))
 474                 vtime_account_idle(tsk);
 475         else
 476                 vtime_account_system(tsk);
 477 }
 478 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 479 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 480
 481 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 482 {
 483         *ut = p->utime;
 484         *st = p->stime;
 485 }
 486 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 487
 488 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 489 {
 490         struct task_cputime cputime;
 491
 492         thread_group_cputime(p, &cputime);
 493
 494         *ut = cputime.utime;
 495         *st = cputime.stime;
 496 }
 497 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 498 /*
 499  * Account a single tick of cpu time.
 500  * @p: the process that the cpu time gets accounted to
 501  * @user_tick: indicates if the tick is a user or a system tick
 502  */
 503 void account_process_tick(struct task_struct *p, int user_tick)
 504 {
 505         cputime_t cputime, scaled, steal;
 506         struct rq *rq = this_rq();
 507
 508         if (vtime_accounting_cpu_enabled())
 509                 return;
 510
 511         if (sched_clock_irqtime) {
 512                 irqtime_account_process_tick(p, user_tick, rq, 1);
 513                 return;
 514         }
 515
 516         cputime = cputime_one_jiffy;
 517         steal = steal_account_process_time(ULONG_MAX);
 518
 519         if (steal >= cputime)
 520                 return;
 521
 522         cputime -= steal;
 523         scaled = cputime_to_scaled(cputime);
 524
 525         if (user_tick)
 526                 account_user_time(p, cputime, scaled);
 527         else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
 528                 account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
 529         else
 530                 account_idle_time(cputime);
 531 }
 532
 533 /*
 534  * Account multiple ticks of idle time.
 535  * @ticks: number of stolen ticks
 536  */
 537 void account_idle_ticks(unsigned long ticks)
 538 {
 539         cputime_t cputime, steal;
 540
 541         if (sched_clock_irqtime) {
 542                 irqtime_account_idle_ticks(ticks);
 543                 return;
 544         }
 545
 546         cputime = jiffies_to_cputime(ticks);
 547         steal = steal_account_process_time(ULONG_MAX);
 548
 549         if (steal >= cputime)
 550                 return;
 551
 552         cputime -= steal;
 553         account_idle_time(cputime);
 554 }
 555
 556 /*
 557  * Perform (stime * rtime) / total, but avoid multiplication overflow by
 558  * loosing precision when the numbers are big.
 559  */
 560 static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
 561 {
 562         u64 scaled;
 563
 564         for (;;) {
 565                 /* Make sure "rtime" is the bigger of stime/rtime */
 566                 if (stime > rtime)
 567                         swap(rtime, stime);
 568
 569                 /* Make sure 'total' fits in 32 bits */
 570                 if (total >> 32)
 571                         goto drop_precision;
 572
 573                 /* Does rtime (and thus stime) fit in 32 bits? */
 574                 if (!(rtime >> 32))
 575                         break;
 576
 577                 /* Can we just balance rtime/stime rather than dropping bits? */
 578                 if (stime >> 31)
 579                         goto drop_precision;
 580
 581                 /* We can grow stime and shrink rtime and try to make them both fit */
 582                 stime <<= 1;
 583                 rtime >>= 1;
 584                 continue;
 585
 586 drop_precision:
 587                 /* We drop from rtime, it has more bits than stime */
 588                 rtime >>= 1;
 589                 total >>= 1;
 590         }
 591
 592         /*
 593          * Make sure gcc understands that this is a 32x32->64 multiply,
 594          * followed by a 64/32->64 divide.
 595          */
 596         scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
 597         return (__force cputime_t) scaled;
 598 }
 599
 600 /*
 601  * Adjust tick based cputime random precision against scheduler runtime
 602  * accounting.
 603  *
 604  * Tick based cputime accounting depend on random scheduling timeslices of a
 605  * task to be interrupted or not by the timer.  Depending on these
 606  * circumstances, the number of these interrupts may be over or
 607  * under-optimistic, matching the real user and system cputime with a variable
 608  * precision.
 609  *
 610  * Fix this by scaling these tick based values against the total runtime
 611  * accounted by the CFS scheduler.
 612  *
 613  * This code provides the following guarantees:
 614  *
 615  *   stime + utime == rtime
 616  *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 617  *
 618  * Assuming that rtime_i+1 >= rtime_i.
 619  */
 620 static void cputime_adjust(struct task_cputime *curr,
 621                            struct prev_cputime *prev,
 622                            cputime_t *ut, cputime_t *st)
 623 {
 624         cputime_t rtime, stime, utime;
 625         unsigned long flags;
 626
 627         /* Serialize concurrent callers such that we can honour our guarantees */
 628         raw_spin_lock_irqsave(&prev->lock, flags);
 629         rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 630
 631         /*
 632          * This is possible under two circumstances:
 633          *  - rtime isn't monotonic after all (a bug);
 634          *  - we got reordered by the lock.
 635          *
 636          * In both cases this acts as a filter such that the rest of the code
 637          * can assume it is monotonic regardless of anything else.
 638          */
 639         if (prev->stime + prev->utime >= rtime)
 640                 goto out;
 641
 642         stime = curr->stime;
 643         utime = curr->utime;
 644
 645         /*
 646          * If either stime or both stime and utime are 0, assume all runtime is
 647          * userspace. Once a task gets some ticks, the monotonicy code at
 648          * 'update' will ensure things converge to the observed ratio.
 649          */
 650         if (stime == 0) {
 651                 utime = rtime;
 652                 goto update;
 653         }
 654
 655         if (utime == 0) {
 656                 stime = rtime;
 657                 goto update;
 658         }
 659
 660         stime = scale_stime((__force u64)stime, (__force u64)rtime,
 661                             (__force u64)(stime + utime));
 662
 663 update:
 664         /*
 665          * Make sure stime doesn't go backwards; this preserves monotonicity
 666          * for utime because rtime is monotonic.
 667          *
 668          *  utime_i+1 = rtime_i+1 - stime_i
 669          *            = rtime_i+1 - (rtime_i - utime_i)
 670          *            = (rtime_i+1 - rtime_i) + utime_i
 671          *            >= utime_i
 672          */
 673         if (stime < prev->stime)
 674                 stime = prev->stime;
 675         utime = rtime - stime;
 676
 677         /*
 678          * Make sure utime doesn't go backwards; this still preserves
 679          * monotonicity for stime, analogous argument to above.
 680          */
 681         if (utime < prev->utime) {
 682                 utime = prev->utime;
 683                 stime = rtime - utime;
 684         }
 685
 686         prev->stime = stime;
 687         prev->utime = utime;
 688 out:
 689         *ut = prev->utime;
 690         *st = prev->stime;
 691         raw_spin_unlock_irqrestore(&prev->lock, flags);
 692 }
 693
 694 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 695 {
 696         struct task_cputime cputime = {
 697                 .sum_exec_runtime = p->se.sum_exec_runtime,
 698         };
 699
 700         task_cputime(p, &cputime.utime, &cputime.stime);
 701         cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 702 }
 703 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 704
 705 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 706 {
 707         struct task_cputime cputime;
 708
 709         thread_group_cputime(p, &cputime);
 710         cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 711 }
 712 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 713
 714 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 715 static cputime_t vtime_delta(struct task_struct *tsk)
 716 {
 717         unsigned long now = READ_ONCE(jiffies);
 718
 719         if (time_before(now, (unsigned long)tsk->vtime_snap))
 720                 return 0;
 721
 722         return jiffies_to_cputime(now - tsk->vtime_snap);
 723 }
 724
 725 static cputime_t get_vtime_delta(struct task_struct *tsk)
 726 {
 727         unsigned long now = READ_ONCE(jiffies);
 728         cputime_t delta, other;
 729
 730         /*
 731          * Unlike tick based timing, vtime based timing never has lost
 732          * ticks, and no need for steal time accounting to make up for
 733          * lost ticks. Vtime accounts a rounded version of actual
 734          * elapsed time. Limit account_other_time to prevent rounding
 735          * errors from causing elapsed vtime to go negative.
 736          */
 737         delta = jiffies_to_cputime(now - tsk->vtime_snap);
 738         other = account_other_time(delta);
 739         WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
 740         tsk->vtime_snap = now;
 741
 742         return delta - other;
 743 }
 744
 745 static void __vtime_account_system(struct task_struct *tsk)
 746 {
 747         cputime_t delta_cpu = get_vtime_delta(tsk);
 748
 749         account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
 750 }
 751
 752 void vtime_account_system(struct task_struct *tsk)
 753 {
 754         if (!vtime_delta(tsk))
 755                 return;
 756
 757         write_seqcount_begin(&tsk->vtime_seqcount);
 758         __vtime_account_system(tsk);
 759         write_seqcount_end(&tsk->vtime_seqcount);
 760 }
 761
 762 void vtime_account_user(struct task_struct *tsk)
 763 {
 764         cputime_t delta_cpu;
 765
 766         write_seqcount_begin(&tsk->vtime_seqcount);
 767         tsk->vtime_snap_whence = VTIME_SYS;
 768         if (vtime_delta(tsk)) {
 769                 delta_cpu = get_vtime_delta(tsk);
 770                 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
 771         }
 772         write_seqcount_end(&tsk->vtime_seqcount);
 773 }
 774
 775 void vtime_user_enter(struct task_struct *tsk)
 776 {
 777         write_seqcount_begin(&tsk->vtime_seqcount);
 778         if (vtime_delta(tsk))
 779                 __vtime_account_system(tsk);
 780         tsk->vtime_snap_whence = VTIME_USER;
 781         write_seqcount_end(&tsk->vtime_seqcount);
 782 }
 783
 784 void vtime_guest_enter(struct task_struct *tsk)
 785 {
 786         /*
 787          * The flags must be updated under the lock with
 788          * the vtime_snap flush and update.
 789          * That enforces a right ordering and update sequence
 790          * synchronization against the reader (task_gtime())
 791          * that can thus safely catch up with a tickless delta.
 792          */
 793         write_seqcount_begin(&tsk->vtime_seqcount);
 794         if (vtime_delta(tsk))
 795                 __vtime_account_system(tsk);
 796         current->flags |= PF_VCPU;
 797         write_seqcount_end(&tsk->vtime_seqcount);
 798 }
 799 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 800
 801 void vtime_guest_exit(struct task_struct *tsk)
 802 {
 803         write_seqcount_begin(&tsk->vtime_seqcount);
 804         __vtime_account_system(tsk);
 805         current->flags &= ~PF_VCPU;
 806         write_seqcount_end(&tsk->vtime_seqcount);
 807 }
 808 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 809
 810 void vtime_account_idle(struct task_struct *tsk)
 811 {
 812         cputime_t delta_cpu = get_vtime_delta(tsk);
 813
 814         account_idle_time(delta_cpu);
 815 }
 816
 817 void arch_vtime_task_switch(struct task_struct *prev)
 818 {
 819         write_seqcount_begin(&prev->vtime_seqcount);
 820         prev->vtime_snap_whence = VTIME_INACTIVE;
 821         write_seqcount_end(&prev->vtime_seqcount);
 822
 823         write_seqcount_begin(&current->vtime_seqcount);
 824         current->vtime_snap_whence = VTIME_SYS;
 825         current->vtime_snap = jiffies;
 826         write_seqcount_end(&current->vtime_seqcount);
 827 }
 828
 829 void vtime_init_idle(struct task_struct *t, int cpu)
 830 {
 831         unsigned long flags;
 832
 833         local_irq_save(flags);
 834         write_seqcount_begin(&t->vtime_seqcount);
 835         t->vtime_snap_whence = VTIME_SYS;
 836         t->vtime_snap = jiffies;
 837         write_seqcount_end(&t->vtime_seqcount);
 838         local_irq_restore(flags);
 839 }
 840
 841 cputime_t task_gtime(struct task_struct *t)
 842 {
 843         unsigned int seq;
 844         cputime_t gtime;
 845
 846         if (!vtime_accounting_enabled())
 847                 return t->gtime;
 848
 849         do {
 850                 seq = read_seqcount_begin(&t->vtime_seqcount);
 851
 852                 gtime = t->gtime;
 853                 if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
 854                         gtime += vtime_delta(t);
 855
 856         } while (read_seqcount_retry(&t->vtime_seqcount, seq));
 857
 858         return gtime;
 859 }
 860
 861 /*
 862  * Fetch cputime raw values from fields of task_struct and
 863  * add up the pending nohz execution time since the last
 864  * cputime snapshot.
 865  */
 866 static void
 867 fetch_task_cputime(struct task_struct *t,
 868                    cputime_t *u_dst, cputime_t *s_dst,
 869                    cputime_t *u_src, cputime_t *s_src,
 870                    cputime_t *udelta, cputime_t *sdelta)
 871 {
 872         unsigned int seq;
 873         unsigned long long delta;
 874
 875         do {
 876                 *udelta = 0;
 877                 *sdelta = 0;
 878
 879                 seq = read_seqcount_begin(&t->vtime_seqcount);
 880
 881                 if (u_dst)
 882                         *u_dst = *u_src;
 883                 if (s_dst)
 884                         *s_dst = *s_src;
 885
 886                 /* Task is sleeping, nothing to add */
 887                 if (t->vtime_snap_whence == VTIME_INACTIVE ||
 888                     is_idle_task(t))
 889                         continue;
 890
 891                 delta = vtime_delta(t);
 892
 893                 /*
 894                  * Task runs either in user or kernel space, add pending nohz time to
 895                  * the right place.
 896                  */
 897                 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
 898                         *udelta = delta;
 899                 } else {
 900                         if (t->vtime_snap_whence == VTIME_SYS)
 901                                 *sdelta = delta;
 902                 }
 903         } while (read_seqcount_retry(&t->vtime_seqcount, seq));
 904 }
 905
 906
 907 void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
 908 {
 909         cputime_t udelta, sdelta;
 910
 911         if (!vtime_accounting_enabled()) {
 912                 if (utime)
 913                         *utime = t->utime;
 914                 if (stime)
 915                         *stime = t->stime;
 916                 return;
 917         }
 918
 919         fetch_task_cputime(t, utime, stime, &t->utime,
 920                            &t->stime, &udelta, &sdelta);
 921         if (utime)
 922                 *utime += udelta;
 923         if (stime)
 924                 *stime += sdelta;
 925 }
 926
 927 void task_cputime_scaled(struct task_struct *t,
 928                          cputime_t *utimescaled, cputime_t *stimescaled)
 929 {
 930         cputime_t udelta, sdelta;
 931
 932         if (!vtime_accounting_enabled()) {
 933                 if (utimescaled)
 934                         *utimescaled = t->utimescaled;
 935                 if (stimescaled)
 936                         *stimescaled = t->stimescaled;
 937                 return;
 938         }
 939
 940         fetch_task_cputime(t, utimescaled, stimescaled,
 941                            &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
 942         if (utimescaled)
 943                 *utimescaled += cputime_to_scaled(udelta);
 944         if (stimescaled)
 945                 *stimescaled += cputime_to_scaled(sdelta);
 946 }
 947 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */