#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <linux/reciprocal_div.h>
+#include <asm/tlb.h>
#include <asm/unistd.h>
/*
#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
+#define NICE_0_LOAD SCHED_LOAD_SCALE
+#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+
/*
* These are the 'tuning knobs' of the scheduler:
*
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+ return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+ sg->__cpu_power += val;
+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
}
/*
- * These are the runqueue data structures:
+ * This is the priority-queue data structure of the RT scheduling class:
*/
+struct rt_prio_array {
+ DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+ struct list_head queue[MAX_RT_PRIO];
+};
+struct load_stat {
+ struct load_weight load;
+ u64 load_update_start, load_update_last;
+ unsigned long delta_fair, delta_exec, delta_stat;
+};
+
+/* CFS-related fields in a runqueue */
+struct cfs_rq {
+ struct load_weight load;
+ unsigned long nr_running;
+
+ s64 fair_clock;
+ u64 exec_clock;
+ s64 wait_runtime;
+ u64 sleeper_bonus;
+ unsigned long wait_runtime_overruns, wait_runtime_underruns;
+
+ struct rb_root tasks_timeline;
+ struct rb_node *rb_leftmost;
+ struct rb_node *rb_load_balance_curr;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* 'curr' points to currently running entity on this cfs_rq.
+ * It is set to NULL otherwise (i.e when none are currently running).
+ */
+ struct sched_entity *curr;
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
+
+ /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
+ *
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
+ */
+ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+#endif
+};
+
+/* Real-Time classes' related field in a runqueue: */
+struct rt_rq {
+ struct rt_prio_array active;
+ int rt_load_balance_idx;
+ struct list_head *rt_load_balance_head, *rt_load_balance_curr;
+};
+
+/*
+ * The prio-array type of the old scheduler:
+ */
struct prio_array {
unsigned int nr_active;
DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
* acquire operations must be ordered by ascending &runqueue.
*/
struct rq {
- spinlock_t lock;
+ spinlock_t lock; /* runqueue lock */
/*
* nr_running and cpu_load should be in the same cacheline because
*/
unsigned long nr_running;
unsigned long raw_weighted_load;
-#ifdef CONFIG_SMP
- unsigned long cpu_load[3];
+ #define CPU_LOAD_IDX_MAX 5
+ unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+ unsigned char in_nohz_recently;
#endif
- unsigned long long nr_switches;
+ struct load_stat ls; /* capture load from *all* tasks on this cpu */
+ unsigned long nr_load_updates;
+ u64 nr_switches;
+
+ struct cfs_rq cfs;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
+#endif
+ struct rt_rq rt;
/*
* This is part of a global counter where only the total sum
unsigned long nr_uninterruptible;
unsigned long expired_timestamp;
- /* Cached timestamp set by update_cpu_clock() */
unsigned long long most_recent_timestamp;
+
struct task_struct *curr, *idle;
unsigned long next_balance;
struct mm_struct *prev_mm;
+
struct prio_array *active, *expired, arrays[2];
int best_expired_prio;
+
+ u64 clock, prev_clock_raw;
+ s64 clock_max_delta;
+
+ unsigned int clock_warps, clock_overflows;
+ unsigned int clock_unstable_events;
+
+ struct sched_class *load_balance_class;
+
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct rq, runqueues);
+static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_MUTEX(sched_hotcpu_mutex);
static inline int cpu_of(struct rq *rq)
{
spin_unlock_irqrestore(&rq->lock, *flags);
}
-#ifdef CONFIG_SCHEDSTATS
-/*
- * bump this up when changing the output format or the meaning of an existing
- * format, so that tools can adapt (or abort)
- */
-#define SCHEDSTAT_VERSION 14
-
-static int show_schedstat(struct seq_file *seq, void *v)
-{
- int cpu;
-
- seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
- seq_printf(seq, "timestamp %lu\n", jiffies);
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
-#ifdef CONFIG_SMP
- struct sched_domain *sd;
- int dcnt = 0;
-#endif
-
- /* runqueue-specific stats */
- seq_printf(seq,
- "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
- cpu, rq->yld_both_empty,
- rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
- rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
- rq->ttwu_cnt, rq->ttwu_local,
- rq->rq_sched_info.cpu_time,
- rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
-
- seq_printf(seq, "\n");
-
-#ifdef CONFIG_SMP
- /* domain-specific stats */
- preempt_disable();
- for_each_domain(cpu, sd) {
- enum idle_type itype;
- char mask_str[NR_CPUS];
-
- cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
- seq_printf(seq, "domain%d %s", dcnt++, mask_str);
- for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
- itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
- "%lu",
- sd->lb_cnt[itype],
- sd->lb_balanced[itype],
- sd->lb_failed[itype],
- sd->lb_imbalance[itype],
- sd->lb_gained[itype],
- sd->lb_hot_gained[itype],
- sd->lb_nobusyq[itype],
- sd->lb_nobusyg[itype]);
- }
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
- " %lu %lu %lu\n",
- sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
- sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
- sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
- sd->ttwu_wake_remote, sd->ttwu_move_affine,
- sd->ttwu_move_balance);
- }
- preempt_enable();
-#endif
- }
- return 0;
-}
-
-static int schedstat_open(struct inode *inode, struct file *file)
-{
- unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
-
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_schedstat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
-}
-
-const struct file_operations proc_schedstat_operations = {
- .open = schedstat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
-{
- if (rq) {
- rq->rq_sched_info.run_delay += delta_jiffies;
- rq->rq_sched_info.pcnt++;
- }
-}
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
-{
- if (rq)
- rq->rq_sched_info.cpu_time += delta_jiffies;
-}
-# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
-{}
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
-{}
-# define schedstat_inc(rq, field) do { } while (0)
-# define schedstat_add(rq, field, amt) do { } while (0)
-#endif
-
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
return rq;
}
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
-/*
- * Called when a process is dequeued from the active array and given
- * the cpu. We should note that with the exception of interactive
- * tasks, the expired queue will become the active queue after the active
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
- * expired queue. (Interactive tasks may be requeued directly to the
- * active queue, thus delaying tasks in the expired queue from running;
- * see scheduler_tick()).
- *
- * This function is only called from sched_info_arrive(), rather than
- * dequeue_task(). Even though a task may be queued and dequeued multiple
- * times as it is shuffled about, we're really interested in knowing how
- * long it was from the *first* time it was queued to the time that it
- * finally hit a cpu.
- */
-static inline void sched_info_dequeued(struct task_struct *t)
-{
- t->sched_info.last_queued = 0;
-}
-
-/*
- * Called when a task finally hits the cpu. We can now calculate how
- * long it was waiting to run. We also note when it began so that we
- * can keep stats on how long its timeslice is.
- */
-static void sched_info_arrive(struct task_struct *t)
-{
- unsigned long now = jiffies, delta_jiffies = 0;
-
- if (t->sched_info.last_queued)
- delta_jiffies = now - t->sched_info.last_queued;
- sched_info_dequeued(t);
- t->sched_info.run_delay += delta_jiffies;
- t->sched_info.last_arrival = now;
- t->sched_info.pcnt++;
-
- rq_sched_info_arrive(task_rq(t), delta_jiffies);
-}
-
-/*
- * Called when a process is queued into either the active or expired
- * array. The time is noted and later used to determine how long we
- * had to wait for us to reach the cpu. Since the expired queue will
- * become the active queue after active queue is empty, without dequeuing
- * and requeuing any tasks, we are interested in queuing to either. It
- * is unusual but not impossible for tasks to be dequeued and immediately
- * requeued in the same or another array: this can happen in sched_yield(),
- * set_user_nice(), and even load_balance() as it moves tasks from runqueue
- * to runqueue.
- *
- * This function is only called from enqueue_task(), but also only updates
- * the timestamp if it is already not set. It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
- */
-static inline void sched_info_queued(struct task_struct *t)
-{
- if (unlikely(sched_info_on()))
- if (!t->sched_info.last_queued)
- t->sched_info.last_queued = jiffies;
-}
-
-/*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily. Now we can calculate how long we ran.
- */
-static inline void sched_info_depart(struct task_struct *t)
-{
- unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
-
- t->sched_info.cpu_time += delta_jiffies;
- rq_sched_info_depart(task_rq(t), delta_jiffies);
-}
-
-/*
- * Called when tasks are switched involuntarily due, typically, to expiring
- * their time slice. (This may also be called when switching to or from
- * the idle task.) We are only called when prev != next.
- */
-static inline void
-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
-{
- struct rq *rq = task_rq(prev);
-
- /*
- * prev now departs the cpu. It's not interesting to record
- * stats about how efficient we were at scheduling the idle
- * process, however.
- */
- if (prev != rq->idle)
- sched_info_depart(prev);
-
- if (next != rq->idle)
- sched_info_arrive(next);
-}
-static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
-{
- if (unlikely(sched_info_on()))
- __sched_info_switch(prev, next);
-}
-#else
-#define sched_info_queued(t) do { } while (0)
-#define sched_info_switch(t, next) do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+#include "sched_stats.h"
/*
* Adding/removing a task to/from a priority array:
if (!tsk_is_polling(p))
smp_send_reschedule(cpu);
}
+
+static void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&rq->lock, flags))
+ return;
+ resched_task(cpu_curr(cpu));
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
#else
static inline void resched_task(struct task_struct *p)
{
}
#ifdef CONFIG_SMP
+
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+ task_thread_info(p)->cpu = cpu;
+}
+
struct migration_req {
struct list_head list;
{
unsigned long flags;
struct rq *rq;
- int preempted;
+ struct prio_array *array;
+ int running;
repeat:
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_running()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_running(rq, p))
+ cpu_relax();
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
rq = task_rq_lock(p, &flags);
- /* Must be off runqueue entirely, not preempted. */
- if (unlikely(p->array || task_running(rq, p))) {
- /* If it's preempted, we yield. It could be a while. */
- preempted = !task_running(rq, p);
- task_rq_unlock(rq, &flags);
+ running = task_running(rq, p);
+ array = p->array;
+ task_rq_unlock(rq, &flags);
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
cpu_relax();
- if (preempted)
- yield();
goto repeat;
}
- task_rq_unlock(rq, &flags);
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it wa still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(array)) {
+ yield();
+ goto repeat;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
}
/***
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
if (local_group) {
this_load = avg_load;
struct sched_domain *sd;
int i;
- if (idle_cpu(cpu))
+ /*
+ * If it is idle, then it is the best cpu to run this task.
+ *
+ * This cpu is also the best, if it has more than one task already.
+ * Siblings must be also busy(in most cases) as they didn't already
+ * pickup the extra load from this cpu and hence we need not check
+ * sibling runqueue info. This will avoid the checks and cache miss
+ * penalities associated with that.
+ */
+ if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
task_rq_unlock(this_rq, &flags);
}
-/*
- * Potentially available exiting-child timeslices are
- * retrieved here - this way the parent does not get
- * penalized for creating too many threads.
- *
- * (this cannot be used to 'generate' timeslices
- * artificially, because any timeslice recovered here
- * was given away by the parent in the first place.)
- */
-void fastcall sched_exit(struct task_struct *p)
-{
- unsigned long flags;
- struct rq *rq;
-
- /*
- * If the child was a (relative-) CPU hog then decrease
- * the sleep_avg of the parent as well.
- */
- rq = task_rq_lock(p->parent, &flags);
- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
- p->parent->time_slice += p->time_slice;
- if (unlikely(p->parent->time_slice > task_timeslice(p)))
- p->parent->time_slice = task_timeslice(p);
- }
- if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = p->parent->sleep_avg /
- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
- (EXIT_WEIGHT + 1);
- task_rq_unlock(rq, &flags);
-}
-
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
*/
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
- struct sched_domain *sd, enum idle_type idle,
+ struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
/*
*/
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_nr_move, unsigned long max_load_move,
- struct sched_domain *sd, enum idle_type idle,
+ struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+ unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle,
cpumask_t *cpus, int *balance)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
max_load = this_load = total_load = total_pwr = 0;
busiest_load_per_task = busiest_nr_running = 0;
this_load_per_task = this_nr_running = 0;
- if (idle == NOT_IDLE)
+ if (idle == CPU_NOT_IDLE)
load_idx = sd->busy_idx;
- else if (idle == NEWLY_IDLE)
+ else if (idle == CPU_NEWLY_IDLE)
load_idx = sd->newidle_idx;
else
load_idx = sd->idle_idx;
}
total_load += avg_load;
- total_pwr += group->cpu_power;
+ total_pwr += group->__cpu_power;
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
- group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+ group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {
this_load = avg_load;
* Busy processors will not participate in power savings
* balance.
*/
- if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
goto group_next;
/*
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->cpu_power,
- (avg_load - this_load) * this->cpu_power)
+ *imbalance = min(max_pull * busiest->__cpu_power,
+ (avg_load - this_load) * this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
* moving them.
*/
- pwr_now += busiest->cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->cpu_power *
- min(this_load_per_task, this_load);
+ pwr_now += busiest->__cpu_power *
+ min(busiest_load_per_task, max_load);
+ pwr_now += this->__cpu_power *
+ min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- busiest->cpu_power;
+ tmp = sg_div_cpu_power(busiest,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
- pwr_move += busiest->cpu_power *
+ pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
- if (max_load * busiest->cpu_power <
+ if (max_load * busiest->__cpu_power <
busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = max_load * busiest->cpu_power / this->cpu_power;
+ tmp = sg_div_cpu_power(this,
+ max_load * busiest->__cpu_power);
else
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- this->cpu_power;
- pwr_move += this->cpu_power *
- min(this_load_per_task, this_load + tmp);
+ tmp = sg_div_cpu_power(this,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += this->__cpu_power *
+ min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
goto ret;
if (this == group_leader && group_leader != group_min) {
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *
-find_busiest_queue(struct sched_group *group, enum idle_type idle,
+find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
unsigned long imbalance, cpumask_t *cpus)
{
struct rq *busiest = NULL, *rq;
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
- struct sched_domain *sd, enum idle_type idle,
+ struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
* let the state of idle sibling percolate up as IDLE, instead of
- * portraying it as NOT_IDLE.
+ * portraying it as CPU_NOT_IDLE.
*/
- if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+ if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
+ /*
+ * some other cpu did the load balance for us.
+ */
+ if (nr_moved && this_cpu != smp_processor_id())
+ resched_cpu(this_cpu);
+
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*
- * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
* this_rq is locked.
*/
static int
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
* let the state of idle sibling percolate up as IDLE, instead of
- * portraying it as NOT_IDLE.
+ * portraying it as CPU_NOT_IDLE.
*/
if (sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
- schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
+ schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
redo:
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+ group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
&sd_idle, &cpus, NULL);
if (!group) {
- schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
+ schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
goto out_balanced;
}
- busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
+ busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
&cpus);
if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
+ schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
goto out_balanced;
}
BUG_ON(busiest == this_rq);
- schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
+ schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
nr_moved = 0;
if (busiest->nr_running > 1) {
double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
minus_1_or_zero(busiest->nr_running),
- imbalance, sd, NEWLY_IDLE, NULL);
+ imbalance, sd, CPU_NEWLY_IDLE, NULL);
spin_unlock(&busiest->lock);
if (!nr_moved) {
}
if (!nr_moved) {
- schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
return nr_moved;
out_balanced:
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
unsigned long next_balance = jiffies + 60 * HZ;
for_each_domain(this_cpu, sd) {
- if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned long interval;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ if (sd->flags & SD_BALANCE_NEWIDLE)
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance_newidle(this_cpu,
- this_rq, sd);
- if (time_after(next_balance,
- sd->last_balance + sd->balance_interval))
- next_balance = sd->last_balance
- + sd->balance_interval;
- if (pulled_task)
- break;
- }
+ this_rq, sd);
+
+ interval = msecs_to_jiffies(sd->balance_interval);
+ if (time_after(next_balance, sd->last_balance + interval))
+ next_balance = sd->last_balance + interval;
+ if (pulled_task)
+ break;
}
if (!pulled_task)
/*
schedstat_inc(sd, alb_cnt);
if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
- RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
+ RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
NULL))
schedstat_inc(sd, alb_pushed);
else
}
}
+#ifdef CONFIG_NO_HZ
+static struct {
+ atomic_t load_balancer;
+ cpumask_t cpu_mask;
+} nohz ____cacheline_aligned = {
+ .load_balancer = ATOMIC_INIT(-1),
+ .cpu_mask = CPU_MASK_NONE,
+};
+
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives...
*
+ * For the ilb owner, tick is not stopped. And this tick will be used
+ * for idle load balancing. ilb owner will still be part of
+ * nohz.cpu_mask..
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. And will be the owner till that cpu becomes busy
+ * or if all cpus in the system stop their ticks at which point
+ * there is no need for ilb owner.
+ *
+ * When the ilb owner becomes busy, it nominates another owner, during the
+ * next busy scheduler_tick()
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+ int cpu = smp_processor_id();
+
+ if (stop_tick) {
+ cpu_set(cpu, nohz.cpu_mask);
+ cpu_rq(cpu)->in_nohz_recently = 1;
+
+ /*
+ * If we are going offline and still the leader, give up!
+ */
+ if (cpu_is_offline(cpu) &&
+ atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ return 0;
+ }
+
+ /* time for ilb owner also to sleep */
+ if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ atomic_set(&nohz.load_balancer, -1);
+ return 0;
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+ return 1;
+ } else if (atomic_read(&nohz.load_balancer) == cpu)
+ return 1;
+ } else {
+ if (!cpu_isset(cpu, nohz.cpu_mask))
+ return 0;
+
+ cpu_clear(cpu, nohz.cpu_mask);
+
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ }
+ return 0;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in arch_init_sched_domains.
*/
-static DEFINE_SPINLOCK(balancing);
-
-static void run_rebalance_domains(struct softirq_action *h)
+static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
- int this_cpu = smp_processor_id(), balance = 1;
- struct rq *this_rq = cpu_rq(this_cpu);
+ int balance = 1;
+ struct rq *rq = cpu_rq(cpu);
unsigned long interval;
struct sched_domain *sd;
- /*
- * We are idle if there are no processes running. This
- * is valid even if we are the idle process (SMT).
- */
- enum idle_type idle = !this_rq->nr_running ?
- SCHED_IDLE : NOT_IDLE;
- /* Earliest time when we have to call run_rebalance_domains again */
+ /* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
- for_each_domain(this_cpu, sd) {
+ for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
interval = sd->balance_interval;
- if (idle != SCHED_IDLE)
+ if (idle != CPU_IDLE)
interval *= sd->busy_factor;
/* scale ms to jiffies */
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+ if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
* not idle.
*/
- idle = NOT_IDLE;
+ idle = CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
}
if (!balance)
break;
}
- this_rq->next_balance = next_balance;
+ rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+ int local_cpu = smp_processor_id();
+ struct rq *local_rq = cpu_rq(local_cpu);
+ enum cpu_idle_type idle = local_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE;
+
+ rebalance_domains(local_cpu, idle);
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * If this cpu is the owner for idle load balancing, then do the
+ * balancing on behalf of the other idle cpus whose ticks are
+ * stopped.
+ */
+ if (local_rq->idle_at_tick &&
+ atomic_read(&nohz.load_balancer) == local_cpu) {
+ cpumask_t cpus = nohz.cpu_mask;
+ struct rq *rq;
+ int balance_cpu;
+
+ cpu_clear(local_cpu, cpus);
+ for_each_cpu_mask(balance_cpu, cpus) {
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched())
+ break;
+
+ rebalance_domains(balance_cpu, CPU_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(local_rq->next_balance, rq->next_balance))
+ local_rq->next_balance = rq->next_balance;
+ }
+ }
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_NO_HZ
+ /*
+ * If we were in the nohz mode recently and busy at the current
+ * scheduler tick, then check if we need to nominate new idle
+ * load balancer.
+ */
+ if (rq->in_nohz_recently && !rq->idle_at_tick) {
+ rq->in_nohz_recently = 0;
+
+ if (atomic_read(&nohz.load_balancer) == cpu) {
+ cpu_clear(cpu, nohz.cpu_mask);
+ atomic_set(&nohz.load_balancer, -1);
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /*
+ * simple selection for now: Nominate the
+ * first cpu in the nohz list to be the next
+ * ilb owner.
+ *
+ * TBD: Traverse the sched domains and nominate
+ * the nearest cpu in the nohz.cpu_mask.
+ */
+ int ilb = first_cpu(nohz.cpu_mask);
+
+ if (ilb != NR_CPUS)
+ resched_cpu(ilb);
+ }
+ }
+
+ /*
+ * If this cpu is idle and doing idle load balancing for all the
+ * cpus with ticks stopped, is it time for that to stop?
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+ cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ resched_cpu(cpu);
+ return;
+ }
+
+ /*
+ * If this cpu is idle and the idle load balancing is done by
+ * someone else, then no need raise the SCHED_SOFTIRQ
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+ cpu_isset(cpu, nohz.cpu_mask))
+ return;
+#endif
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
}
#else
/*
unsigned long long now = sched_clock();
struct task_struct *p = current;
int cpu = smp_processor_id();
+ int idle_at_tick = idle_cpu(cpu);
struct rq *rq = cpu_rq(cpu);
update_cpu_clock(p, rq, now);
- if (p != rq->idle)
+ if (!idle_at_tick)
task_running_tick(rq, p);
#ifdef CONFIG_SMP
update_load(rq);
- if (time_after_eq(jiffies, rq->next_balance))
- raise_softirq(SCHED_SOFTIRQ);
+ rq->idle_at_tick = idle_at_tick;
+ trigger_load_balance(cpu);
#endif
}
struct task_struct *p;
int retval;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
if (!p) {
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return -ESRCH;
}
out_unlock:
put_task_struct(p);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return retval;
}
struct task_struct *p;
int retval;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
read_lock(&tasklist_lock);
retval = -ESRCH;
out_unlock:
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
if (retval)
return retval;
BUG_ON(!in_softirq());
if (need_resched() && system_state == SYSTEM_RUNNING) {
- raw_local_irq_disable();
- _local_bh_enable();
- raw_local_irq_enable();
+ local_bh_enable();
__cond_resched();
local_bh_disable();
return 1;
show_task(p);
} while_each_thread(g, p);
+ touch_all_softlockup_watchdogs();
+
read_unlock(&tasklist_lock);
/*
* Only show locks if all tasks are dumped:
debug_show_all_locks();
}
+void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+{
+ /* nothing yet */
+}
+
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
struct rq *rq;
switch (action) {
+ case CPU_LOCK_ACQUIRE:
+ mutex_lock(&sched_hotcpu_mutex);
+ break;
+
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
if (IS_ERR(p))
return NOTIFY_BAD;
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
/* Strictly unneccessary, as first user will wake it. */
wake_up_process(cpu_rq(cpu)->migration_thread);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!cpu_rq(cpu)->migration_thread)
break;
/* Unbind it from offline cpu so it can run. Fall thru. */
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
BUG_ON(rq->nr_running != 0);
/* No need to migrate the tasks: it was best-effort if
- * they didn't do lock_cpu_hotplug(). Just wake up
+ * they didn't take sched_hotcpu_mutex. Just wake up
* the requestors. */
spin_lock_irq(&rq->lock);
while (!list_empty(&rq->migration_queue)) {
spin_unlock_irq(&rq->lock);
break;
#endif
+ case CPU_LOCK_RELEASE:
+ mutex_unlock(&sched_hotcpu_mutex);
+ break;
}
return NOTIFY_OK;
}
break;
}
- if (!group->cpu_power) {
+ if (!group->__cpu_power) {
printk("\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
continue;
sg->cpumask = CPU_MASK_NONE;
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
for_each_cpu_mask(j, span) {
if (group_fn(j, cpu_map, NULL) != group)
#define SD_NODES_PER_DOMAIN 16
-/*
- * Self-tuning task migration cost measurement between source and target CPUs.
- *
- * This is done by measuring the cost of manipulating buffers of varying
- * sizes. For a given buffer-size here are the steps that are taken:
- *
- * 1) the source CPU reads+dirties a shared buffer
- * 2) the target CPU reads+dirties the same shared buffer
- *
- * We measure how long they take, in the following 4 scenarios:
- *
- * - source: CPU1, target: CPU2 | cost1
- * - source: CPU2, target: CPU1 | cost2
- * - source: CPU1, target: CPU1 | cost3
- * - source: CPU2, target: CPU2 | cost4
- *
- * We then calculate the cost3+cost4-cost1-cost2 difference - this is
- * the cost of migration.
- *
- * We then start off from a small buffer-size and iterate up to larger
- * buffer sizes, in 5% steps - measuring each buffer-size separately, and
- * doing a maximum search for the cost. (The maximum cost for a migration
- * normally occurs when the working set size is around the effective cache
- * size.)
- */
-#define SEARCH_SCOPE 2
-#define MIN_CACHE_SIZE (64*1024U)
-#define DEFAULT_CACHE_SIZE (5*1024*1024U)
-#define ITERATIONS 1
-#define SIZE_THRESH 130
-#define COST_THRESH 130
-
-/*
- * The migration cost is a function of 'domain distance'. Domain
- * distance is the number of steps a CPU has to iterate down its
- * domain tree to share a domain with the other CPU. The farther
- * two CPUs are from each other, the larger the distance gets.
- *
- * Note that we use the distance only to cache measurement results,
- * the distance value is not used numerically otherwise. When two
- * CPUs have the same distance it is assumed that the migration
- * cost is the same. (this is a simplification but quite practical)
- */
-#define MAX_DOMAIN_DISTANCE 32
-
-static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
- { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
-/*
- * Architectures may override the migration cost and thus avoid
- * boot-time calibration. Unit is nanoseconds. Mostly useful for
- * virtualized hardware:
- */
-#ifdef CONFIG_DEFAULT_MIGRATION_COST
- CONFIG_DEFAULT_MIGRATION_COST
-#else
- -1LL
-#endif
-};
-
-/*
- * Allow override of migration cost - in units of microseconds.
- * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
- * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
- */
-static int __init migration_cost_setup(char *str)
-{
- int ints[MAX_DOMAIN_DISTANCE+1], i;
-
- str = get_options(str, ARRAY_SIZE(ints), ints);
-
- printk("#ints: %d\n", ints[0]);
- for (i = 1; i <= ints[0]; i++) {
- migration_cost[i-1] = (unsigned long long)ints[i]*1000;
- printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
- }
- return 1;
-}
-
-__setup ("migration_cost=", migration_cost_setup);
-
-/*
- * Global multiplier (divisor) for migration-cutoff values,
- * in percentiles. E.g. use a value of 150 to get 1.5 times
- * longer cache-hot cutoff times.
- *
- * (We scale it from 100 to 128 to long long handling easier.)
- */
-
-#define MIGRATION_FACTOR_SCALE 128
-
-static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
-
-static int __init setup_migration_factor(char *str)
-{
- get_option(&str, &migration_factor);
- migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
- return 1;
-}
-
-__setup("migration_factor=", setup_migration_factor);
-
-/*
- * Estimated distance of two CPUs, measured via the number of domains
- * we have to pass for the two CPUs to be in the same span:
- */
-static unsigned long domain_distance(int cpu1, int cpu2)
-{
- unsigned long distance = 0;
- struct sched_domain *sd;
-
- for_each_domain(cpu1, sd) {
- WARN_ON(!cpu_isset(cpu1, sd->span));
- if (cpu_isset(cpu2, sd->span))
- return distance;
- distance++;
- }
- if (distance >= MAX_DOMAIN_DISTANCE) {
- WARN_ON(1);
- distance = MAX_DOMAIN_DISTANCE-1;
- }
-
- return distance;
-}
-
-static unsigned int migration_debug;
-
-static int __init setup_migration_debug(char *str)
-{
- get_option(&str, &migration_debug);
- return 1;
-}
-
-__setup("migration_debug=", setup_migration_debug);
-
-/*
- * Maximum cache-size that the scheduler should try to measure.
- * Architectures with larger caches should tune this up during
- * bootup. Gets used in the domain-setup code (i.e. during SMP
- * bootup).
- */
-unsigned int max_cache_size;
-
-static int __init setup_max_cache_size(char *str)
-{
- get_option(&str, &max_cache_size);
- return 1;
-}
-
-__setup("max_cache_size=", setup_max_cache_size);
-
-/*
- * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
- * is the operation that is timed, so we try to generate unpredictable
- * cachemisses that still end up filling the L2 cache:
- */
-static void touch_cache(void *__cache, unsigned long __size)
-{
- unsigned long size = __size / sizeof(long);
- unsigned long chunk1 = size / 3;
- unsigned long chunk2 = 2 * size / 3;
- unsigned long *cache = __cache;
- int i;
-
- for (i = 0; i < size/6; i += 8) {
- switch (i % 6) {
- case 0: cache[i]++;
- case 1: cache[size-1-i]++;
- case 2: cache[chunk1-i]++;
- case 3: cache[chunk1+i]++;
- case 4: cache[chunk2-i]++;
- case 5: cache[chunk2+i]++;
- }
- }
-}
-
-/*
- * Measure the cache-cost of one task migration. Returns in units of nsec.
- */
-static unsigned long long
-measure_one(void *cache, unsigned long size, int source, int target)
-{
- cpumask_t mask, saved_mask;
- unsigned long long t0, t1, t2, t3, cost;
-
- saved_mask = current->cpus_allowed;
-
- /*
- * Flush source caches to RAM and invalidate them:
- */
- sched_cacheflush();
-
- /*
- * Migrate to the source CPU:
- */
- mask = cpumask_of_cpu(source);
- set_cpus_allowed(current, mask);
- WARN_ON(smp_processor_id() != source);
-
- /*
- * Dirty the working set:
- */
- t0 = sched_clock();
- touch_cache(cache, size);
- t1 = sched_clock();
-
- /*
- * Migrate to the target CPU, dirty the L2 cache and access
- * the shared buffer. (which represents the working set
- * of a migrated task.)
- */
- mask = cpumask_of_cpu(target);
- set_cpus_allowed(current, mask);
- WARN_ON(smp_processor_id() != target);
-
- t2 = sched_clock();
- touch_cache(cache, size);
- t3 = sched_clock();
-
- cost = t1-t0 + t3-t2;
-
- if (migration_debug >= 2)
- printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
- source, target, t1-t0, t1-t0, t3-t2, cost);
- /*
- * Flush target caches to RAM and invalidate them:
- */
- sched_cacheflush();
-
- set_cpus_allowed(current, saved_mask);
-
- return cost;
-}
-
-/*
- * Measure a series of task migrations and return the average
- * result. Since this code runs early during bootup the system
- * is 'undisturbed' and the average latency makes sense.
- *
- * The algorithm in essence auto-detects the relevant cache-size,
- * so it will properly detect different cachesizes for different
- * cache-hierarchies, depending on how the CPUs are connected.
- *
- * Architectures can prime the upper limit of the search range via
- * max_cache_size, otherwise the search range defaults to 20MB...64K.
- */
-static unsigned long long
-measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
-{
- unsigned long long cost1, cost2;
- int i;
-
- /*
- * Measure the migration cost of 'size' bytes, over an
- * average of 10 runs:
- *
- * (We perturb the cache size by a small (0..4k)
- * value to compensate size/alignment related artifacts.
- * We also subtract the cost of the operation done on
- * the same CPU.)
- */
- cost1 = 0;
-
- /*
- * dry run, to make sure we start off cache-cold on cpu1,
- * and to get any vmalloc pagefaults in advance:
- */
- measure_one(cache, size, cpu1, cpu2);
- for (i = 0; i < ITERATIONS; i++)
- cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
-
- measure_one(cache, size, cpu2, cpu1);
- for (i = 0; i < ITERATIONS; i++)
- cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
-
- /*
- * (We measure the non-migrating [cached] cost on both
- * cpu1 and cpu2, to handle CPUs with different speeds)
- */
- cost2 = 0;
-
- measure_one(cache, size, cpu1, cpu1);
- for (i = 0; i < ITERATIONS; i++)
- cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
-
- measure_one(cache, size, cpu2, cpu2);
- for (i = 0; i < ITERATIONS; i++)
- cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
-
- /*
- * Get the per-iteration migration cost:
- */
- do_div(cost1, 2 * ITERATIONS);
- do_div(cost2, 2 * ITERATIONS);
-
- return cost1 - cost2;
-}
-
-static unsigned long long measure_migration_cost(int cpu1, int cpu2)
-{
- unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
- unsigned int max_size, size, size_found = 0;
- long long cost = 0, prev_cost;
- void *cache;
-
- /*
- * Search from max_cache_size*5 down to 64K - the real relevant
- * cachesize has to lie somewhere inbetween.
- */
- if (max_cache_size) {
- max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
- size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
- } else {
- /*
- * Since we have no estimation about the relevant
- * search range
- */
- max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
- size = MIN_CACHE_SIZE;
- }
-
- if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
- printk("cpu %d and %d not both online!\n", cpu1, cpu2);
- return 0;
- }
-
- /*
- * Allocate the working set:
- */
- cache = vmalloc(max_size);
- if (!cache) {
- printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
- return 1000000; /* return 1 msec on very small boxen */
- }
-
- while (size <= max_size) {
- prev_cost = cost;
- cost = measure_cost(cpu1, cpu2, cache, size);
-
- /*
- * Update the max:
- */
- if (cost > 0) {
- if (max_cost < cost) {
- max_cost = cost;
- size_found = size;
- }
- }
- /*
- * Calculate average fluctuation, we use this to prevent
- * noise from triggering an early break out of the loop:
- */
- fluct = abs(cost - prev_cost);
- avg_fluct = (avg_fluct + fluct)/2;
-
- if (migration_debug)
- printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
- "(%8Ld %8Ld)\n",
- cpu1, cpu2, size,
- (long)cost / 1000000,
- ((long)cost / 100000) % 10,
- (long)max_cost / 1000000,
- ((long)max_cost / 100000) % 10,
- domain_distance(cpu1, cpu2),
- cost, avg_fluct);
-
- /*
- * If we iterated at least 20% past the previous maximum,
- * and the cost has dropped by more than 20% already,
- * (taking fluctuations into account) then we assume to
- * have found the maximum and break out of the loop early:
- */
- if (size_found && (size*100 > size_found*SIZE_THRESH))
- if (cost+avg_fluct <= 0 ||
- max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
-
- if (migration_debug)
- printk("-> found max.\n");
- break;
- }
- /*
- * Increase the cachesize in 10% steps:
- */
- size = size * 10 / 9;
- }
-
- if (migration_debug)
- printk("[%d][%d] working set size found: %d, cost: %Ld\n",
- cpu1, cpu2, size_found, max_cost);
-
- vfree(cache);
-
- /*
- * A task is considered 'cache cold' if at least 2 times
- * the worst-case cost of migration has passed.
- *
- * (this limit is only listened to if the load-balancing
- * situation is 'nice' - if there is a large imbalance we
- * ignore it for the sake of CPU utilization and
- * processing fairness.)
- */
- return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
-}
-
-static void calibrate_migration_costs(const cpumask_t *cpu_map)
-{
- int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
- unsigned long j0, j1, distance, max_distance = 0;
- struct sched_domain *sd;
-
- j0 = jiffies;
-
- /*
- * First pass - calculate the cacheflush times:
- */
- for_each_cpu_mask(cpu1, *cpu_map) {
- for_each_cpu_mask(cpu2, *cpu_map) {
- if (cpu1 == cpu2)
- continue;
- distance = domain_distance(cpu1, cpu2);
- max_distance = max(max_distance, distance);
- /*
- * No result cached yet?
- */
- if (migration_cost[distance] == -1LL)
- migration_cost[distance] =
- measure_migration_cost(cpu1, cpu2);
- }
- }
- /*
- * Second pass - update the sched domain hierarchy with
- * the new cache-hot-time estimations:
- */
- for_each_cpu_mask(cpu, *cpu_map) {
- distance = 0;
- for_each_domain(cpu, sd) {
- sd->cache_hot_time = migration_cost[distance];
- distance++;
- }
- }
- /*
- * Print the matrix:
- */
- if (migration_debug)
- printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
- max_cache_size,
-#ifdef CONFIG_X86
- cpu_khz/1000
-#else
- -1
-#endif
- );
- if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
- printk("migration_cost=");
- for (distance = 0; distance <= max_distance; distance++) {
- if (distance)
- printk(",");
- printk("%ld", (long)migration_cost[distance] / 1000);
- }
- printk("\n");
- }
- j1 = jiffies;
- if (migration_debug)
- printk("migration: %ld seconds\n", (j1-j0) / HZ);
-
- /*
- * Move back to the original CPU. NUMA-Q gets confused
- * if we migrate to another quad during bootup.
- */
- if (raw_smp_processor_id() != orig_cpu) {
- cpumask_t mask = cpumask_of_cpu(orig_cpu),
- saved_mask = current->cpus_allowed;
-
- set_cpus_allowed(current, mask);
- set_cpus_allowed(current, saved_mask);
- }
-}
-
#ifdef CONFIG_NUMA
/**
continue;
}
- sg->cpu_power += sd->groups->cpu_power;
+ sg_inc_cpu_power(sg, sd->groups->__cpu_power);
}
sg = sg->next;
if (sg != group_head)
child = sd->child;
+ sd->groups->__cpu_power = 0;
+
/*
* For perf policy, if the groups in child domain share resources
* (for example cores sharing some portions of the cache hierarchy
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
(child->flags &
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
return;
}
- sd->groups->cpu_power = 0;
-
/*
* add cpu_power of each child group to this groups cpu_power
*/
group = child->groups;
do {
- sd->groups->cpu_power += group->cpu_power;
+ sg_inc_cpu_power(sd->groups, group->__cpu_power);
group = group->next;
} while (group != child->groups);
}
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = nodemask;
sg->next = sg;
cpus_or(covered, covered, nodemask);
"Can not alloc domain group for node %d\n", j);
goto error;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = tmp;
sg->next = prev->next;
cpus_or(covered, covered, tmp);
#endif
cpu_attach_domain(sd, i);
}
- /*
- * Tune cache-hot values:
- */
- calibrate_migration_costs(cpu_map);
return 0;
{
int err;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
detach_destroy_domains(&cpu_online_map);
err = arch_init_sched_domains(&cpu_online_map);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return err;
}
{
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
detach_destroy_domains(&cpu_online_map);
return NOTIFY_OK;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/*
* Fall through and re-initialise the domains.
*/
{
cpumask_t non_isolated_cpus;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
arch_init_sched_domains(&cpu_online_map);
cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
if (cpus_empty(non_isolated_cpus))
cpu_set(smp_processor_id(), non_isolated_cpus);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
void normalize_rt_tasks(void)
{
struct prio_array *array;
- struct task_struct *p;
+ struct task_struct *g, *p;
unsigned long flags;
struct rq *rq;
read_lock_irq(&tasklist_lock);
- for_each_process(p) {
+
+ do_each_thread(g, p) {
if (!rt_task(p))
continue;
__task_rq_unlock(rq);
spin_unlock_irqrestore(&p->pi_lock, flags);
- }
+ } while_each_thread(g, p);
+
read_unlock_irq(&tasklist_lock);
}