sched, trace: Add a tracepoint for IPI-less remote wakeups
[cascardo/linux.git] / kernel / sched / core.c
index 092e511..e4c0ddd 100644 (file)
@@ -564,6 +564,8 @@ void resched_task(struct task_struct *p)
 
        if (set_nr_and_not_polling(p))
                smp_send_reschedule(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
 }
 
 void resched_cpu(int cpu)
@@ -647,6 +649,8 @@ static void wake_up_idle_cpu(int cpu)
        smp_mb();
        if (!tsk_is_polling(rq->idle))
                smp_send_reschedule(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
 }
 
 static bool wake_up_full_nohz_cpu(int cpu)
@@ -872,7 +876,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
        rq->clock_task += delta;
 
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-       if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+       if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
                sched_rt_avg_update(rq, irq_delta + steal);
 #endif
 }
@@ -3033,7 +3037,7 @@ EXPORT_SYMBOL(set_user_nice);
 int can_nice(const struct task_struct *p, const int nice)
 {
        /* convert nice value [19,-20] to rlimit style value [1,40] */
-       int nice_rlim = 20 - nice;
+       int nice_rlim = nice_to_rlimit(nice);
 
        return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
                capable(CAP_SYS_NICE));
@@ -3057,17 +3061,10 @@ SYSCALL_DEFINE1(nice, int, increment)
         * We don't have to worry. Conceptually one call occurs first
         * and we have a single winner.
         */
-       if (increment < -40)
-               increment = -40;
-       if (increment > 40)
-               increment = 40;
-
+       increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
        nice = task_nice(current) + increment;
-       if (nice < MIN_NICE)
-               nice = MIN_NICE;
-       if (nice > MAX_NICE)
-               nice = MAX_NICE;
 
+       nice = clamp_val(nice, MIN_NICE, MAX_NICE);
        if (increment < 0 && !can_nice(current, nice))
                return -EPERM;
 
@@ -3226,17 +3223,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
  * We ask for the deadline not being zero, and greater or equal
  * than the runtime, as well as the period of being zero or
  * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
  */
 static bool
 __checkparam_dl(const struct sched_attr *attr)
 {
-       return attr && attr->sched_deadline != 0 &&
-               (attr->sched_period == 0 ||
-               (s64)(attr->sched_period   - attr->sched_deadline) >= 0) &&
-               (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  &&
-               attr->sched_runtime >= (2 << (DL_SCALE - 1));
+       /* deadline != 0 */
+       if (attr->sched_deadline == 0)
+               return false;
+
+       /*
+        * Since we truncate DL_SCALE bits, make sure we're at least
+        * that big.
+        */
+       if (attr->sched_runtime < (1ULL << DL_SCALE))
+               return false;
+
+       /*
+        * Since we use the MSB for wrap-around and sign issues, make
+        * sure it's not set (mind that period can be equal to zero).
+        */
+       if (attr->sched_deadline & (1ULL << 63) ||
+           attr->sched_period & (1ULL << 63))
+               return false;
+
+       /* runtime <= deadline <= period (if period != 0) */
+       if ((attr->sched_period != 0 &&
+            attr->sched_period < attr->sched_deadline) ||
+           attr->sched_deadline < attr->sched_runtime)
+               return false;
+
+       return true;
 }
 
 /*
@@ -3634,13 +3654,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
         */
        attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 
-out:
-       return ret;
+       return 0;
 
 err_size:
        put_user(sizeof(*attr), &uattr->size);
-       ret = -E2BIG;
-       goto out;
+       return -E2BIG;
 }
 
 /**
@@ -3689,8 +3707,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
        if (!uattr || pid < 0 || flags)
                return -EINVAL;
 
-       if (sched_copy_attr(uattr, &attr))
-               return -EFAULT;
+       retval = sched_copy_attr(uattr, &attr);
+       if (retval)
+               return retval;
+
+       if (attr.sched_policy < 0)
+               return -EINVAL;
 
        rcu_read_lock();
        retval = -ESRCH;
@@ -3740,7 +3762,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
  */
 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
-       struct sched_param lp;
+       struct sched_param lp = { .sched_priority = 0 };
        struct task_struct *p;
        int retval;
 
@@ -3757,11 +3779,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
        if (retval)
                goto out_unlock;
 
-       if (task_has_dl_policy(p)) {
-               retval = -EINVAL;
-               goto out_unlock;
-       }
-       lp.sched_priority = p->rt_priority;
+       if (task_has_rt_policy(p))
+               lp.sched_priority = p->rt_priority;
        rcu_read_unlock();
 
        /*
@@ -3799,7 +3818,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 
                for (; addr < end; addr++) {
                        if (*addr)
-                               goto err_size;
+                               return -EFBIG;
                }
 
                attr->size = usize;
@@ -3809,12 +3828,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
        if (ret)
                return -EFAULT;
 
-out:
-       return ret;
-
-err_size:
-       ret = -E2BIG;
-       goto out;
+       return 0;
 }
 
 /**
@@ -4185,7 +4199,7 @@ EXPORT_SYMBOL(yield);
  *     false (0) if we failed to boost the target.
  *     -ESRCH if there's no task to yield to.
  */
-bool __sched yield_to(struct task_struct *p, bool preempt)
+int __sched yield_to(struct task_struct *p, bool preempt)
 {
        struct task_struct *curr = current;
        struct rq *rq, *p_rq;
@@ -5079,11 +5093,20 @@ static struct notifier_block migration_notifier = {
        .priority = CPU_PRI_MIGRATION,
 };
 
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+       int cpu = smp_processor_id();
+       struct rq *rq = cpu_rq(cpu);
+       rq->age_stamp = sched_clock_cpu(cpu);
+}
+
 static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
 {
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_STARTING:
+               set_cpu_rq_start_time();
+               return NOTIFY_OK;
        case CPU_DOWN_FAILED:
                set_cpu_active((long)hcpu, true);
                return NOTIFY_OK;
@@ -5202,14 +5225,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                }
 
                /*
-                * Even though we initialize ->power to something semi-sane,
-                * we leave power_orig unset. This allows us to detect if
+                * Even though we initialize ->capacity to something semi-sane,
+                * we leave capacity_orig unset. This allows us to detect if
                 * domain iteration is still funny without causing /0 traps.
                 */
-               if (!group->sgp->power_orig) {
+               if (!group->sgc->capacity_orig) {
                        printk(KERN_CONT "\n");
-                       printk(KERN_ERR "ERROR: domain->cpu_power not "
-                                       "set\n");
+                       printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
                        break;
                }
 
@@ -5231,9 +5253,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
                printk(KERN_CONT " %s", str);
-               if (group->sgp->power != SCHED_POWER_SCALE) {
-                       printk(KERN_CONT " (cpu_power = %d)",
-                               group->sgp->power);
+               if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+                       printk(KERN_CONT " (cpu_capacity = %d)",
+                               group->sgc->capacity);
                }
 
                group = group->next;
@@ -5291,7 +5313,7 @@ static int sd_degenerate(struct sched_domain *sd)
                         SD_BALANCE_NEWIDLE |
                         SD_BALANCE_FORK |
                         SD_BALANCE_EXEC |
-                        SD_SHARE_CPUPOWER |
+                        SD_SHARE_CPUCAPACITY |
                         SD_SHARE_PKG_RESOURCES |
                         SD_SHARE_POWERDOMAIN)) {
                if (sd->groups != sd->groups->next)
@@ -5322,7 +5344,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_BALANCE_NEWIDLE |
                                SD_BALANCE_FORK |
                                SD_BALANCE_EXEC |
-                               SD_SHARE_CPUPOWER |
+                               SD_SHARE_CPUCAPACITY |
                                SD_SHARE_PKG_RESOURCES |
                                SD_PREFER_SIBLING |
                                SD_SHARE_POWERDOMAIN);
@@ -5447,7 +5469,7 @@ static struct root_domain *alloc_rootdomain(void)
        return rd;
 }
 
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
 {
        struct sched_group *tmp, *first;
 
@@ -5458,8 +5480,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
        do {
                tmp = sg->next;
 
-               if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
-                       kfree(sg->sgp);
+               if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+                       kfree(sg->sgc);
 
                kfree(sg);
                sg = tmp;
@@ -5477,7 +5499,7 @@ static void free_sched_domain(struct rcu_head *rcu)
        if (sd->flags & SD_OVERLAP) {
                free_sched_groups(sd->groups, 1);
        } else if (atomic_dec_and_test(&sd->groups->ref)) {
-               kfree(sd->groups->sgp);
+               kfree(sd->groups->sgc);
                kfree(sd->groups);
        }
        kfree(sd);
@@ -5688,17 +5710,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 
                cpumask_or(covered, covered, sg_span);
 
-               sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-               if (atomic_inc_return(&sg->sgp->ref) == 1)
+               sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+               if (atomic_inc_return(&sg->sgc->ref) == 1)
                        build_group_mask(sd, sg);
 
                /*
-                * Initialize sgp->power such that even if we mess up the
+                * Initialize sgc->capacity such that even if we mess up the
                 * domains and no possible iteration will get us here, we won't
                 * die on a /0 trap.
                 */
-               sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
-               sg->sgp->power_orig = sg->sgp->power;
+               sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+               sg->sgc->capacity_orig = sg->sgc->capacity;
 
                /*
                 * Make sure the first group of this domain contains the
@@ -5736,8 +5758,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 
        if (sg) {
                *sg = *per_cpu_ptr(sdd->sg, cpu);
-               (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
-               atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+               (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+               atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
        }
 
        return cpu;
@@ -5746,7 +5768,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 /*
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
+ * and ->cpu_capacity to 0.
  *
  * Assumes the sched_domain tree is fully constructed
  */
@@ -5778,8 +5800,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
                        continue;
 
                group = get_group(i, sdd, &sg);
-               cpumask_clear(sched_group_cpus(sg));
-               sg->sgp->power = 0;
                cpumask_setall(sched_group_mask(sg));
 
                for_each_cpu(j, span) {
@@ -5802,16 +5822,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 }
 
 /*
- * Initialize sched groups cpu_power.
+ * Initialize sched groups cpu_capacity.
  *
- * cpu_power indicates the capacity of sched group, which is used while
+ * cpu_capacity indicates the capacity of sched group, which is used while
  * distributing the load between different sched groups in a sched domain.
- * Typically cpu_power for all the groups in a sched domain will be same unless
- * there are asymmetries in the topology. If there are asymmetries, group
- * having more cpu_power will pickup more load compared to the group having
- * less cpu_power.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
  */
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 {
        struct sched_group *sg = sd->groups;
 
@@ -5825,8 +5845,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
        if (cpu != group_balance_cpu(sg))
                return;
 
-       update_group_power(sd, cpu);
-       atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+       update_group_capacity(sd, cpu);
+       atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
 }
 
 /*
@@ -5917,8 +5937,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
        if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
                *per_cpu_ptr(sdd->sg, cpu) = NULL;
 
-       if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
-               *per_cpu_ptr(sdd->sgp, cpu) = NULL;
+       if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+               *per_cpu_ptr(sdd->sgc, cpu) = NULL;
 }
 
 #ifdef CONFIG_NUMA
@@ -5931,7 +5951,7 @@ static int sched_domains_curr_level;
 /*
  * SD_flags allowed in topology descriptions.
  *
- * SD_SHARE_CPUPOWER      - describes SMT topologies
+ * SD_SHARE_CPUCAPACITY      - describes SMT topologies
  * SD_SHARE_PKG_RESOURCES - describes shared caches
  * SD_NUMA                - describes NUMA topologies
  * SD_SHARE_POWERDOMAIN   - describes shared power domain
@@ -5940,7 +5960,7 @@ static int sched_domains_curr_level;
  * SD_ASYM_PACKING        - describes SMT quirks
  */
 #define TOPOLOGY_SD_FLAGS              \
-       (SD_SHARE_CPUPOWER |            \
+       (SD_SHARE_CPUCAPACITY |         \
         SD_SHARE_PKG_RESOURCES |       \
         SD_NUMA |                      \
         SD_ASYM_PACKING |              \
@@ -5986,7 +6006,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                                        | 1*SD_BALANCE_FORK
                                        | 0*SD_BALANCE_WAKE
                                        | 1*SD_WAKE_AFFINE
-                                       | 0*SD_SHARE_CPUPOWER
+                                       | 0*SD_SHARE_CPUCAPACITY
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 0*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
@@ -6008,7 +6028,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
         * Convert topological properties into behaviour.
         */
 
-       if (sd->flags & SD_SHARE_CPUPOWER) {
+       if (sd->flags & SD_SHARE_CPUCAPACITY) {
                sd->imbalance_pct = 110;
                sd->smt_gain = 1178; /* ~15% */
 
@@ -6215,7 +6235,7 @@ static void sched_init_numa(void)
        /* Compute default topology size */
        for (i = 0; sched_domain_topology[i].mask; i++);
 
-       tl = kzalloc((i + level) *
+       tl = kzalloc((i + level + 1) *
                        sizeof(struct sched_domain_topology_level), GFP_KERNEL);
        if (!tl)
                return;
@@ -6320,14 +6340,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
                if (!sdd->sg)
                        return -ENOMEM;
 
-               sdd->sgp = alloc_percpu(struct sched_group_power *);
-               if (!sdd->sgp)
+               sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+               if (!sdd->sgc)
                        return -ENOMEM;
 
                for_each_cpu(j, cpu_map) {
                        struct sched_domain *sd;
                        struct sched_group *sg;
-                       struct sched_group_power *sgp;
+                       struct sched_group_capacity *sgc;
 
                        sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
                                        GFP_KERNEL, cpu_to_node(j));
@@ -6345,12 +6365,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
                        *per_cpu_ptr(sdd->sg, j) = sg;
 
-                       sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
+                       sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
                                        GFP_KERNEL, cpu_to_node(j));
-                       if (!sgp)
+                       if (!sgc)
                                return -ENOMEM;
 
-                       *per_cpu_ptr(sdd->sgp, j) = sgp;
+                       *per_cpu_ptr(sdd->sgc, j) = sgc;
                }
        }
 
@@ -6377,15 +6397,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
                        if (sdd->sg)
                                kfree(*per_cpu_ptr(sdd->sg, j));
-                       if (sdd->sgp)
-                               kfree(*per_cpu_ptr(sdd->sgp, j));
+                       if (sdd->sgc)
+                               kfree(*per_cpu_ptr(sdd->sgc, j));
                }
                free_percpu(sdd->sd);
                sdd->sd = NULL;
                free_percpu(sdd->sg);
                sdd->sg = NULL;
-               free_percpu(sdd->sgp);
-               sdd->sgp = NULL;
+               free_percpu(sdd->sgc);
+               sdd->sgc = NULL;
        }
 }
 
@@ -6455,14 +6475,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
                }
        }
 
-       /* Calculate CPU power for physical packages and nodes */
+       /* Calculate CPU capacity for physical packages and nodes */
        for (i = nr_cpumask_bits-1; i >= 0; i--) {
                if (!cpumask_test_cpu(i, cpu_map))
                        continue;
 
                for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
                        claim_allocations(i, sd);
-                       init_sched_groups_power(i, sd);
+                       init_sched_groups_capacity(i, sd);
                }
        }
 
@@ -6905,7 +6925,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
-               rq->cpu_power = SCHED_POWER_SCALE;
+               rq->cpu_capacity = SCHED_CAPACITY_SCALE;
                rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
@@ -6963,6 +6983,7 @@ void __init sched_init(void)
        if (cpu_isolated_map == NULL)
                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
        idle_thread_set_boot_cpu();
+       set_cpu_rq_start_time();
 #endif
        init_sched_fair_class();