Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[cascardo/linux.git] / kernel / sched / cputime.c
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c

index 9858266..5ebee31 100644 (file)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -23,10 +23,8 @@
   * task when irq is in progress while we read rq->clock. That is a worthy
   * compromise in place of having locks on each irq in account_system_time.
   */
-DEFINE_PER_CPU(u64, cpu_hardirq_time);
-DEFINE_PER_CPU(u64, cpu_softirq_time);
+DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
  
-static DEFINE_PER_CPU(u64, irq_start_time);
  static int sched_clock_irqtime;
  
  void enable_sched_clock_irqtime(void)
@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void)
         sched_clock_irqtime = 0;
  }
  
-#ifndef CONFIG_64BIT
-DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-#endif /* CONFIG_64BIT */
-
  /*
   * Called before incrementing preempt_count on {soft,}irq_enter
   * and before decrementing preempt_count on {soft,}irq_exit.
   */
  void irqtime_account_irq(struct task_struct *curr)
  {
+       struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
         s64 delta;
         int cpu;
  
@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr)
                 return;
  
         cpu = smp_processor_id();
-       delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
-       __this_cpu_add(irq_start_time, delta);
+       delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
+       irqtime->irq_start_time += delta;
  
-       irq_time_write_begin();
+       u64_stats_update_begin(&irqtime->sync);
         /*
          * We do not account for softirq time from ksoftirqd here.
          * We want to continue accounting softirq time to ksoftirqd thread
@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr)
          * that do not consume any time, but still wants to run.
          */
         if (hardirq_count())
-               __this_cpu_add(cpu_hardirq_time, delta);
+               irqtime->hardirq_time += delta;
         else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
-               __this_cpu_add(cpu_softirq_time, delta);
+               irqtime->softirq_time += delta;
  
-       irq_time_write_end();
+       u64_stats_update_end(&irqtime->sync);
  }
  EXPORT_SYMBOL_GPL(irqtime_account_irq);
  
-static cputime_t irqtime_account_hi_update(cputime_t maxtime)
+static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
  {
         u64 *cpustat = kcpustat_this_cpu->cpustat;
-       unsigned long flags;
         cputime_t irq_cputime;
  
-       local_irq_save(flags);
-       irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
-                     cpustat[CPUTIME_IRQ];
+       irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
         irq_cputime = min(irq_cputime, maxtime);
-       cpustat[CPUTIME_IRQ] += irq_cputime;
-       local_irq_restore(flags);
+       cpustat[idx] += irq_cputime;
+
         return irq_cputime;
  }
  
-static cputime_t irqtime_account_si_update(cputime_t maxtime)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
  {
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
-       unsigned long flags;
-       cputime_t softirq_cputime;
+       return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
+                                     CPUTIME_IRQ, maxtime);
+}
  
-       local_irq_save(flags);
-       softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
-                         cpustat[CPUTIME_SOFTIRQ];
-       softirq_cputime = min(softirq_cputime, maxtime);
-       cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
-       local_irq_restore(flags);
-       return softirq_cputime;
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
+{
+       return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
+                                     CPUTIME_SOFTIRQ, maxtime);
  }
  
  #else /* CONFIG_IRQ_TIME_ACCOUNTING */
@@ -263,6 +252,11 @@ void account_idle_time(cputime_t cputime)
                 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
  }
  
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
  static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
  {
  #ifdef CONFIG_PARAVIRT
@@ -290,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max)
  {
         cputime_t accounted;
  
+       /* Shall be converted to a lockdep-enabled lightweight check */
+       WARN_ON_ONCE(!irqs_disabled());
+
         accounted = steal_account_process_time(max);
  
         if (accounted < max)
@@ -301,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max)
         return accounted;
  }
  
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+       return t->se.sum_exec_runtime;
+}
+#else
+static u64 read_sum_exec_runtime(struct task_struct *t)
+{
+       u64 ns;
+       struct rq_flags rf;
+       struct rq *rq;
+
+       rq = task_rq_lock(t, &rf);
+       ns = t->se.sum_exec_runtime;
+       task_rq_unlock(rq, t, &rf);
+
+       return ns;
+}
+#endif
+
  /*
   * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
   * tasks (sum on group iteration) belonging to @tsk's group.
@@ -313,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
         unsigned int seq, nextseq;
         unsigned long flags;
  
+       /*
+        * Update current task runtime to account pending time since last
+        * scheduler action or thread_group_cputime() call. This thread group
+        * might have other running tasks on different CPUs, but updating
+        * their runtime can affect syscall performance, so we skip accounting
+        * those pending times and rely only on values updated on tick or
+        * other scheduler action.
+        */
+       if (same_thread_group(current, tsk))
+               (void) task_sched_runtime(current);
+
         rcu_read_lock();
         /* Attempt a lockless read on the first round. */
         nextseq = 0;
@@ -327,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
                         task_cputime(t, &utime, &stime);
                         times->utime += utime;
                         times->stime += stime;
-                       times->sum_exec_runtime += task_sched_runtime(t);
+                       times->sum_exec_runtime += read_sum_exec_runtime(t);
                 }
                 /* If lockless access failed, take the lock. */
                 nextseq = 1;
@@ -371,7 +399,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
          * idle, or potentially user or system time. Due to rounding,
          * other time can exceed ticks occasionally.
          */
-       other = account_other_time(cputime);
+       other = account_other_time(ULONG_MAX);
         if (other >= cputime)
                 return;
         cputime -= other;
@@ -486,7 +514,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
         }
  
         cputime = cputime_one_jiffy;
-       steal = steal_account_process_time(cputime);
+       steal = steal_account_process_time(ULONG_MAX);
  
         if (steal >= cputime)
                 return;
@@ -516,7 +544,7 @@ void account_idle_ticks(unsigned long ticks)
         }
  
         cputime = jiffies_to_cputime(ticks);
-       steal = steal_account_process_time(cputime);
+       steal = steal_account_process_time(ULONG_MAX);
  
         if (steal >= cputime)
                 return;
@@ -614,19 +642,25 @@ static void cputime_adjust(struct task_cputime *curr,
         stime = curr->stime;
         utime = curr->utime;
  
-       if (utime == 0) {
-               stime = rtime;
+       /*
+        * If either stime or both stime and utime are 0, assume all runtime is
+        * userspace. Once a task gets some ticks, the monotonicity code at
+        * 'update' will ensure things converge to the observed ratio.
+        */
+       if (stime == 0) {
+               utime = rtime;
                 goto update;
         }
  
-       if (stime == 0) {
-               utime = rtime;
+       if (utime == 0) {
+               stime = rtime;
                 goto update;
         }
  
         stime = scale_stime((__force u64)stime, (__force u64)rtime,
                             (__force u64)(stime + utime));
  
+update:
         /*
          * Make sure stime doesn't go backwards; this preserves monotonicity
          * for utime because rtime is monotonic.
@@ -649,7 +683,6 @@ static void cputime_adjust(struct task_cputime *curr,
                 stime = rtime - utime;
         }
  
-update:
         prev->stime = stime;
         prev->utime = utime;
  out:
@@ -694,6 +727,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
         unsigned long now = READ_ONCE(jiffies);
         cputime_t delta, other;
  
+       /*
+        * Unlike tick based timing, vtime based timing never has lost
+        * ticks, so there is no need for steal time accounting to make up for
+        * lost ticks. Vtime accounts a rounded version of actual
+        * elapsed time. Limit account_other_time to prevent rounding
+        * errors from causing elapsed vtime to go negative.
+        */
         delta = jiffies_to_cputime(now - tsk->vtime_snap);
         other = account_other_time(delta);
         WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);