Merge branch 'sched/urgent' into sched/core, to pick up fixes

author Ingo Molnar <mingo@kernel.org>

Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)

committer Ingo Molnar <mingo@kernel.org>

Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
author Ingo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
committer Ingo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c

index eea2a6f..1ef5e48 100644 (file)
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void)
         if (!has_steal_clock)
                 return;
  
-       memset(st, 0, sizeof(*st));
-
         wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
         pr_info("kvm-stealtime: cpu %d, msr %llx\n",
                 cpu, (unsigned long long) slow_virt_to_phys(st));
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 253538f..b45acfd 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2139,6 +2139,9 @@ static inline void put_task_struct(struct task_struct *t)
                 __put_task_struct(t);
  }
  
+struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+struct task_struct *try_get_task_struct(struct task_struct **ptask);
+
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  extern void task_cputime(struct task_struct *t,
                          cputime_t *utime, cputime_t *stime);
diff --git a/kernel/exit.c b/kernel/exit.c

index 9e6e135..2fb4d44 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -210,6 +210,82 @@ repeat:
                 goto repeat;
  }
  
+/*
+ * Note that if this function returns a valid task_struct pointer (!NULL)
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+{
+       struct sighand_struct *sighand;
+       struct task_struct *task;
+
+       /*
+        * We need to verify that release_task() was not called and thus
+        * delayed_put_task_struct() can't run and drop the last reference
+        * before rcu_read_unlock(). We check task->sighand != NULL,
+        * but we can read the already freed and reused memory.
+        */
+retry:
+       task = rcu_dereference(*ptask);
+       if (!task)
+               return NULL;
+
+       probe_kernel_address(&task->sighand, sighand);
+
+       /*
+        * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+        * was already freed we can not miss the preceding update of this
+        * pointer.
+        */
+       smp_rmb();
+       if (unlikely(task != READ_ONCE(*ptask)))
+               goto retry;
+
+       /*
+        * We've re-checked that "task == *ptask", now we have two different
+        * cases:
+        *
+        * 1. This is actually the same task/task_struct. In this case
+        *    sighand != NULL tells us it is still alive.
+        *
+        * 2. This is another task which got the same memory for task_struct.
+        *    We can't know this of course, and we can not trust
+        *    sighand != NULL.
+        *
+        *    In this case we actually return a random value, but this is
+        *    correct.
+        *
+        *    If we return NULL - we can pretend that we actually noticed that
+        *    *ptask was updated when the previous task has exited. Or pretend
+        *    that probe_kernel_address(&task->sighand, sighand) reads NULL.
+        *
+        *    If we return the new task (because sighand is not NULL for any
+        *    reason) - this is fine too. This (new) task can't go away before
+        *    another RCU grace period passes.
+        *
+        *    And note: We could even eliminate the false positive if we
+        *    re-read task->sighand once again to avoid a falsely NULL result.
+        *    But this case is very unlikely so we don't care.
+        */
+       if (!sighand)
+               return NULL;
+
+       return task;
+}
+
+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+       struct task_struct *task;
+
+       rcu_read_lock();
+       task = task_rcu_dereference(ptask);
+       if (task)
+               get_task_struct(task);
+       rcu_read_unlock();
+
+       return task;
+}
+
  /*
   * Determine if a process group is "orphaned", according to the POSIX
   * definition in 2.2.2.52.  Orphaned process groups are not to be affected
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 51d7105..e406ba0 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7231,7 +7231,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
         struct rq *rq = cpu_rq(cpu);
  
         rq->calc_load_update = calc_load_update;
-       account_reset_rq(rq);
         update_max_interval();
  }
  
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c

index 75f98c5..3d60e5d 100644 (file)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime)
                 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
  }
  
-static __always_inline bool steal_account_process_tick(void)
+static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
  {
  #ifdef CONFIG_PARAVIRT
         if (static_key_false(&paravirt_steal_enabled)) {
@@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void)
                  * time in jiffies. Lets cast the result to jiffies
                  * granularity and account the rest on the next rounds.
                  */
-               steal_jiffies = nsecs_to_jiffies(steal);
+               steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
                 this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
  
                 account_steal_time(jiffies_to_cputime(steal_jiffies));
                 return steal_jiffies;
         }
  #endif
-       return false;
+       return 0;
  }
  
  /*
@@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
         u64 cputime = (__force u64) cputime_one_jiffy;
         u64 *cpustat = kcpustat_this_cpu->cpustat;
  
-       if (steal_account_process_tick())
+       if (steal_account_process_tick(ULONG_MAX))
                 return;
  
         cputime *= ticks;
@@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
                 return;
         }
  
-       if (steal_account_process_tick())
+       if (steal_account_process_tick(ULONG_MAX))
                 return;
  
         if (user_tick)
@@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
  static cputime_t get_vtime_delta(struct task_struct *tsk)
  {
         unsigned long now = READ_ONCE(jiffies);
-       unsigned long delta = now - tsk->vtime_snap;
+       unsigned long delta_jiffies, steal_jiffies;
  
+       delta_jiffies = now - tsk->vtime_snap;
+       steal_jiffies = steal_account_process_tick(delta_jiffies);
         WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
         tsk->vtime_snap = now;
  
-       return jiffies_to_cputime(delta);
+       return jiffies_to_cputime(delta_jiffies - steal_jiffies);
  }
  
  static void __vtime_account_system(struct task_struct *tsk)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 0368c39..2a0a999 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
  
         nr_switches = p->nvcsw + p->nivcsw;
  
-#ifdef CONFIG_SCHEDSTATS
         P(se.nr_migrations);
  
+#ifdef CONFIG_SCHEDSTATS
         if (schedstat_enabled()) {
                 u64 avg_atom, avg_per_cpu;
  
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index c8c5d2d..7306356 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1303,6 +1303,8 @@ static void task_numa_assign(struct task_numa_env *env,
  {
         if (env->best_task)
                 put_task_struct(env->best_task);
+       if (p)
+               get_task_struct(p);
  
         env->best_task = p;
         env->best_imp = imp;
@@ -1370,31 +1372,11 @@ static void task_numa_compare(struct task_numa_env *env,
         long imp = env->p->numa_group ? groupimp : taskimp;
         long moveimp = imp;
         int dist = env->dist;
-       bool assigned = false;
  
         rcu_read_lock();
-
-       raw_spin_lock_irq(&dst_rq->lock);
-       cur = dst_rq->curr;
-       /*
-        * No need to move the exiting task or idle task.
-        */
-       if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+       cur = task_rcu_dereference(&dst_rq->curr);
+       if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
                 cur = NULL;
-       else {
-               /*
-                * The task_struct must be protected here to protect the
-                * p->numa_faults access in the task_weight since the
-                * numa_faults could already be freed in the following path:
-                * finish_task_switch()
-                *     --> put_task_struct()
-                *         --> __put_task_struct()
-                *             --> task_numa_free()
-                */
-               get_task_struct(cur);
-       }
-
-       raw_spin_unlock_irq(&dst_rq->lock);
  
         /*
          * Because we have preemption enabled we can get migrated around and
@@ -1477,7 +1459,6 @@ balance:
                  */
                 if (!load_too_imbalanced(src_load, dst_load, env)) {
                         imp = moveimp - 1;
-                       put_task_struct(cur);
                         cur = NULL;
                         goto assign;
                 }
@@ -1503,16 +1484,9 @@ balance:
                 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
  
  assign:
-       assigned = true;
         task_numa_assign(env, cur, imp);
  unlock:
         rcu_read_unlock();
-       /*
-        * The dst_rq->curr isn't assigned. The protection for task_struct is
-        * finished.
-        */
-       if (cur && !assigned)
-               put_task_struct(cur);
  }
  
  static void task_numa_find_cpu(struct task_numa_env *env,
@@ -3698,7 +3672,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
  {
         if (unlikely(cfs_rq->throttle_count))
-               return cfs_rq->throttled_clock_task;
+               return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
  
         return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
  }
@@ -3836,13 +3810,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  
         cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
         if (!cfs_rq->throttle_count) {
                 /* adjust cfs_rq_clock_task() */
                 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                              cfs_rq->throttled_clock_task;
         }
-#endif
  
         return 0;
  }
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c

index c5aeedf..9fb873c 100644 (file)
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -201,6 +201,8 @@ exit_idle:
   */
  static void cpu_idle_loop(void)
  {
+       int cpu = smp_processor_id();
+
         while (1) {
                 /*
                  * If the arch has a polling bit, we maintain an invariant:
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
                         check_pgt_cache();
                         rmb();
  
-                       if (cpu_is_offline(smp_processor_id())) {
+                       if (cpu_is_offline(cpu)) {
                                 cpuhp_report_idle_dead();
                                 arch_cpu_idle_dead();
                         }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 7cbeb92..71ce986 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1809,16 +1809,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
  #else /* arch_scale_freq_capacity */
  #define arch_scale_freq_invariant()    (false)
  #endif
-
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-       rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
-       rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-       rq->prev_steal_time_rq = 0;
-#endif
-}
author	Ingo Molnar <mingo@kernel.org>
	Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
arch/x86/kernel/kvm.c		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/cputime.c		patch \| blob \| history
kernel/sched/debug.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/idle.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history