Merge branch 'sched/urgent' into sched/core, to pick up fixes
author Ingo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
committer Ingo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/kernel/kvm.c
include/linux/sched.h
kernel/exit.c
kernel/sched/core.c
kernel/sched/cputime.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/idle.c
kernel/sched/sched.h

index eea2a6f..1ef5e48 100644 (file)
@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void)
        if (!has_steal_clock)
                return;
 
-       memset(st, 0, sizeof(*st));
-
        wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
        pr_info("kvm-stealtime: cpu %d, msr %llx\n",
                cpu, (unsigned long long) slow_virt_to_phys(st));
index 253538f..b45acfd 100644 (file)
@@ -2139,6 +2139,9 @@ static inline void put_task_struct(struct task_struct *t)
                __put_task_struct(t);
 }
 
+struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+struct task_struct *try_get_task_struct(struct task_struct **ptask);
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void task_cputime(struct task_struct *t,
                         cputime_t *utime, cputime_t *stime);
index 9e6e135..2fb4d44 100644 (file)
@@ -210,6 +210,82 @@ repeat:
                goto repeat;
 }
 
+/*
+ * Note that if this function returns a valid task_struct pointer (!NULL)
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+{
+       struct sighand_struct *sighand;
+       struct task_struct *task;
+
+       /*
+        * We need to verify that release_task() was not called and thus
+        * delayed_put_task_struct() can't run and drop the last reference
+        * before rcu_read_unlock(). We check task->sighand != NULL,
+        * but we can read the already freed and reused memory.
+        */
+retry:
+       task = rcu_dereference(*ptask);
+       if (!task)
+               return NULL;
+
+       probe_kernel_address(&task->sighand, sighand);
+
+       /*
+        * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+        * was already freed we can not miss the preceding update of this
+        * pointer.
+        */
+       smp_rmb();
+       if (unlikely(task != READ_ONCE(*ptask)))
+               goto retry;
+
+       /*
+        * We've re-checked that "task == *ptask", now we have two different
+        * cases:
+        *
+        * 1. This is actually the same task/task_struct. In this case
+        *    sighand != NULL tells us it is still alive.
+        *
+        * 2. This is another task which got the same memory for task_struct.
+        *    We can't know this of course, and we can not trust
+        *    sighand != NULL.
+        *
+        *    In this case we actually return a random value, but this is
+        *    correct.
+        *
+        *    If we return NULL - we can pretend that we actually noticed that
+        *    *ptask was updated when the previous task has exited. Or pretend
+        *    that probe_kernel_address(&task->sighand, sighand) reads NULL.
+        *
+        *    If we return the new task (because sighand is not NULL for any
+        *    reason) - this is fine too. This (new) task can't go away before
+        *    another gp pass.
+        *
+        *    And note: We could even eliminate the false positive if we
+        *    re-read task->sighand once again to avoid a falsely NULL value.
+        *    But this case is very unlikely so we don't care.
+        */
+       if (!sighand)
+               return NULL;
+
+       return task;
+}
+
+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+       struct task_struct *task;
+
+       rcu_read_lock();
+       task = task_rcu_dereference(ptask);
+       if (task)
+               get_task_struct(task);
+       rcu_read_unlock();
+
+       return task;
+}
+
 /*
  * Determine if a process group is "orphaned", according to the POSIX
  * definition in 2.2.2.52.  Orphaned process groups are not to be affected
index 51d7105..e406ba0 100644 (file)
@@ -7231,7 +7231,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
        struct rq *rq = cpu_rq(cpu);
 
        rq->calc_load_update = calc_load_update;
-       account_reset_rq(rq);
        update_max_interval();
 }
 
index 75f98c5..3d60e5d 100644 (file)
@@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime)
                cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
-static __always_inline bool steal_account_process_tick(void)
+static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
 {
 #ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
@@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void)
                 * time in jiffies. Lets cast the result to jiffies
                 * granularity and account the rest on the next rounds.
                 */
-               steal_jiffies = nsecs_to_jiffies(steal);
+               steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
                this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
 
                account_steal_time(jiffies_to_cputime(steal_jiffies));
                return steal_jiffies;
        }
 #endif
-       return false;
+       return 0;
 }
 
 /*
@@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
        u64 cputime = (__force u64) cputime_one_jiffy;
        u64 *cpustat = kcpustat_this_cpu->cpustat;
 
-       if (steal_account_process_tick())
+       if (steal_account_process_tick(ULONG_MAX))
                return;
 
        cputime *= ticks;
@@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;
        }
 
-       if (steal_account_process_tick())
+       if (steal_account_process_tick(ULONG_MAX))
                return;
 
        if (user_tick)
@@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
        unsigned long now = READ_ONCE(jiffies);
-       unsigned long delta = now - tsk->vtime_snap;
+       unsigned long delta_jiffies, steal_jiffies;
 
+       delta_jiffies = now - tsk->vtime_snap;
+       steal_jiffies = steal_account_process_tick(delta_jiffies);
        WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
        tsk->vtime_snap = now;
 
-       return jiffies_to_cputime(delta);
+       return jiffies_to_cputime(delta_jiffies - steal_jiffies);
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
index 0368c39..2a0a999 100644 (file)
@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
        nr_switches = p->nvcsw + p->nivcsw;
 
-#ifdef CONFIG_SCHEDSTATS
        P(se.nr_migrations);
 
+#ifdef CONFIG_SCHEDSTATS
        if (schedstat_enabled()) {
                u64 avg_atom, avg_per_cpu;
 
index c8c5d2d..7306356 100644 (file)
@@ -1303,6 +1303,8 @@ static void task_numa_assign(struct task_numa_env *env,
 {
        if (env->best_task)
                put_task_struct(env->best_task);
+       if (p)
+               get_task_struct(p);
 
        env->best_task = p;
        env->best_imp = imp;
@@ -1370,31 +1372,11 @@ static void task_numa_compare(struct task_numa_env *env,
        long imp = env->p->numa_group ? groupimp : taskimp;
        long moveimp = imp;
        int dist = env->dist;
-       bool assigned = false;
 
        rcu_read_lock();
-
-       raw_spin_lock_irq(&dst_rq->lock);
-       cur = dst_rq->curr;
-       /*
-        * No need to move the exiting task or idle task.
-        */
-       if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+       cur = task_rcu_dereference(&dst_rq->curr);
+       if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
                cur = NULL;
-       else {
-               /*
-                * The task_struct must be protected here to protect the
-                * p->numa_faults access in the task_weight since the
-                * numa_faults could already be freed in the following path:
-                * finish_task_switch()
-                *     --> put_task_struct()
-                *         --> __put_task_struct()
-                *             --> task_numa_free()
-                */
-               get_task_struct(cur);
-       }
-
-       raw_spin_unlock_irq(&dst_rq->lock);
 
        /*
         * Because we have preemption enabled we can get migrated around and
@@ -1477,7 +1459,6 @@ balance:
                 */
                if (!load_too_imbalanced(src_load, dst_load, env)) {
                        imp = moveimp - 1;
-                       put_task_struct(cur);
                        cur = NULL;
                        goto assign;
                }
@@ -1503,16 +1484,9 @@ balance:
                env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
 
 assign:
-       assigned = true;
        task_numa_assign(env, cur, imp);
 unlock:
        rcu_read_unlock();
-       /*
-        * The dst_rq->curr isn't assigned. The protection for task_struct is
-        * finished.
-        */
-       if (cur && !assigned)
-               put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
@@ -3698,7 +3672,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 {
        if (unlikely(cfs_rq->throttle_count))
-               return cfs_rq->throttled_clock_task;
+               return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
 
        return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
 }
@@ -3836,13 +3810,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
        cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
        if (!cfs_rq->throttle_count) {
                /* adjust cfs_rq_clock_task() */
                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                             cfs_rq->throttled_clock_task;
        }
-#endif
 
        return 0;
 }
index c5aeedf..9fb873c 100644 (file)
@@ -201,6 +201,8 @@ exit_idle:
  */
 static void cpu_idle_loop(void)
 {
+       int cpu = smp_processor_id();
+
        while (1) {
                /*
                 * If the arch has a polling bit, we maintain an invariant:
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
                        check_pgt_cache();
                        rmb();
 
-                       if (cpu_is_offline(smp_processor_id())) {
+                       if (cpu_is_offline(cpu)) {
                                cpuhp_report_idle_dead();
                                arch_cpu_idle_dead();
                        }
index 7cbeb92..71ce986 100644 (file)
@@ -1809,16 +1809,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
 #else /* arch_scale_freq_capacity */
 #define arch_scale_freq_invariant()    (false)
 #endif
-
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-       rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
-       rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-       rq->prev_steal_time_rq = 0;
-#endif
-}