staging: ks7010: bail out when registering netdevice fails
[cascardo/linux.git] / drivers / cpufreq / powernv-cpufreq.c
index 39ac78c..54c4536 100644 (file)
 #include <asm/reg.h>
 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
 #include <asm/opal.h>
+#include <linux/timer.h>
 
 #define POWERNV_MAX_PSTATES    256
 #define PMSR_PSAFE_ENABLE      (1UL << 30)
 #define PMSR_SPR_EM_DISABLE    (1UL << 31)
 #define PMSR_MAX(x)            ((x >> 32) & 0xFF)
 
+#define MAX_RAMP_DOWN_TIME                             5120
+/*
+ * On an idle system we want the global pstate to ramp-down from max value to
+ * min over a span of ~5 secs. Also we want it to initially ramp-down slowly and
+ * then ramp-down rapidly later on.
+ *
+ * This gives a percentage rampdown for time elapsed in milliseconds.
+ * ramp_down_percentage = ((ms * ms) >> 18)
+ *                     ~= 3.8 * (sec * sec)
+ *
+ * At 0 ms     ramp_down_percent = 0
+ * At 5120 ms  ramp_down_percent = 100
+ */
+#define ramp_down_percent(time)                ((time * time) >> 18)
+
+/* Interval after which the timer is queued to bring down global pstate */
+#define GPSTATE_TIMER_INTERVAL                         2000
+
+/**
+ * struct global_pstate_info - Per policy data structure to maintain history of
+ *                             global pstates
+ * @highest_lpstate:           The local pstate from which we are ramping down
+ * @elapsed_time:              Time in ms spent in ramping down from
+ *                             highest_lpstate
+ * @last_sampled_time:         Time from boot in ms when global pstates were
+ *                             last set
+ * @last_lpstate,last_gpstate: Last set values for local and global pstates
+ * @timer:                     Is used for ramping down if cpu goes idle for
+ *                             a long time with global pstate held high
+ * @gpstate_lock:              A spinlock to maintain synchronization between
+ *                             routines called by the timer handler and
+ *                             governer's target_index calls
+ */
+struct global_pstate_info {
+       int highest_lpstate;
+       unsigned int elapsed_time;
+       unsigned int last_sampled_time;
+       int last_lpstate;
+       int last_gpstate;
+       spinlock_t gpstate_lock;
+       struct timer_list timer;
+};
+
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
 static bool rebooting, throttled, occ_reset;
 
@@ -94,6 +138,17 @@ static struct powernv_pstate_info {
        int nr_pstates;
 } powernv_pstate_info;
 
+static inline void reset_gpstates(struct cpufreq_policy *policy)
+{
+       struct global_pstate_info *gpstates = policy->driver_data;
+
+       gpstates->highest_lpstate = 0;
+       gpstates->elapsed_time = 0;
+       gpstates->last_sampled_time = 0;
+       gpstates->last_lpstate = 0;
+       gpstates->last_gpstate = 0;
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree
@@ -285,6 +340,7 @@ static inline void set_pmspr(unsigned long sprn, unsigned long val)
 struct powernv_smp_call_data {
        unsigned int freq;
        int pstate_id;
+       int gpstate_id;
 };
 
 /*
@@ -343,19 +399,21 @@ static unsigned int powernv_cpufreq_get(unsigned int cpu)
  * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
  * on this CPU should be present in freq_data->pstate_id.
  */
-static void set_pstate(void *freq_data)
+static void set_pstate(void *data)
 {
        unsigned long val;
-       unsigned long pstate_ul =
-               ((struct powernv_smp_call_data *) freq_data)->pstate_id;
+       struct powernv_smp_call_data *freq_data = data;
+       unsigned long pstate_ul = freq_data->pstate_id;
+       unsigned long gpstate_ul = freq_data->gpstate_id;
 
        val = get_pmspr(SPRN_PMCR);
        val = val & 0x0000FFFFFFFFFFFFULL;
 
        pstate_ul = pstate_ul & 0xFF;
+       gpstate_ul = gpstate_ul & 0xFF;
 
        /* Set both global(bits 56..63) and local(bits 48..55) PStates */
-       val = val | (pstate_ul << 56) | (pstate_ul << 48);
+       val = val | (gpstate_ul << 56) | (pstate_ul << 48);
 
        pr_debug("Setting cpu %d pmcr to %016lX\n",
                        raw_smp_processor_id(), val);
@@ -424,6 +482,111 @@ next:
        }
 }
 
+/**
+ * calc_global_pstate - Calculate global pstate
+ * @elapsed_time:      Elapsed time in milliseconds
+ * @local_pstate:      New local pstate
+ * @highest_lpstate:   pstate from which its ramping down
+ *
+ * Finds the appropriate global pstate based on the pstate from which its
+ * ramping down and the time elapsed in ramping down. It follows a quadratic
+ * equation which ensures that it reaches ramping down to pmin in 5sec.
+ */
+static inline int calc_global_pstate(unsigned int elapsed_time,
+                                    int highest_lpstate, int local_pstate)
+{
+       int pstate_diff;
+
+       /*
+        * Using ramp_down_percent we get the percentage of rampdown
+        * that we are expecting to be dropping. Difference between
+        * highest_lpstate and powernv_pstate_info.min will give a absolute
+        * number of how many pstates we will drop eventually by the end of
+        * 5 seconds, then just scale it get the number pstates to be dropped.
+        */
+       pstate_diff =  ((int)ramp_down_percent(elapsed_time) *
+                       (highest_lpstate - powernv_pstate_info.min)) / 100;
+
+       /* Ensure that global pstate is >= to local pstate */
+       if (highest_lpstate - pstate_diff < local_pstate)
+               return local_pstate;
+       else
+               return highest_lpstate - pstate_diff;
+}
+
+static inline void  queue_gpstate_timer(struct global_pstate_info *gpstates)
+{
+       unsigned int timer_interval;
+
+       /*
+        * Setting up timer to fire after GPSTATE_TIMER_INTERVAL ms, But
+        * if it exceeds MAX_RAMP_DOWN_TIME ms for ramp down time.
+        * Set timer such that it fires exactly at MAX_RAMP_DOWN_TIME
+        * seconds of ramp down time.
+        */
+       if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
+            > MAX_RAMP_DOWN_TIME)
+               timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
+       else
+               timer_interval = GPSTATE_TIMER_INTERVAL;
+
+       mod_timer_pinned(&gpstates->timer, jiffies +
+                       msecs_to_jiffies(timer_interval));
+}
+
+/**
+ * gpstate_timer_handler
+ *
+ * @data: pointer to cpufreq_policy on which timer was queued
+ *
+ * This handler brings down the global pstate closer to the local pstate
+ * according quadratic equation. Queues a new timer if it is still not equal
+ * to local pstate
+ */
+void gpstate_timer_handler(unsigned long data)
+{
+       struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+       struct global_pstate_info *gpstates = policy->driver_data;
+       int gpstate_id;
+       unsigned int time_diff = jiffies_to_msecs(jiffies)
+                                       - gpstates->last_sampled_time;
+       struct powernv_smp_call_data freq_data;
+
+       if (!spin_trylock(&gpstates->gpstate_lock))
+               return;
+
+       gpstates->last_sampled_time += time_diff;
+       gpstates->elapsed_time += time_diff;
+       freq_data.pstate_id = gpstates->last_lpstate;
+
+       if ((gpstates->last_gpstate == freq_data.pstate_id) ||
+           (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+               gpstate_id = freq_data.pstate_id;
+               reset_gpstates(policy);
+               gpstates->highest_lpstate = freq_data.pstate_id;
+       } else {
+               gpstate_id = calc_global_pstate(gpstates->elapsed_time,
+                                               gpstates->highest_lpstate,
+                                               freq_data.pstate_id);
+       }
+
+       /*
+        * If local pstate is equal to global pstate, rampdown is over
+        * So timer is not required to be queued.
+        */
+       if (gpstate_id != freq_data.pstate_id)
+               queue_gpstate_timer(gpstates);
+
+       freq_data.gpstate_id = gpstate_id;
+       gpstates->last_gpstate = freq_data.gpstate_id;
+       gpstates->last_lpstate = freq_data.pstate_id;
+
+       spin_unlock(&gpstates->gpstate_lock);
+
+       /* Timer may get migrated to a different cpu on cpu hot unplug */
+       smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
+}
+
 /*
  * powernv_cpufreq_target_index: Sets the frequency corresponding to
  * the cpufreq table entry indexed by new_index on the cpus in the
@@ -433,6 +596,8 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
                                        unsigned int new_index)
 {
        struct powernv_smp_call_data freq_data;
+       unsigned int cur_msec, gpstate_id;
+       struct global_pstate_info *gpstates = policy->driver_data;
 
        if (unlikely(rebooting) && new_index != get_nominal_index())
                return 0;
@@ -440,28 +605,81 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
        if (!throttled)
                powernv_cpufreq_throttle_check(NULL);
 
+       cur_msec = jiffies_to_msecs(get_jiffies_64());
+
+       spin_lock(&gpstates->gpstate_lock);
        freq_data.pstate_id = powernv_freqs[new_index].driver_data;
 
+       if (!gpstates->last_sampled_time) {
+               gpstate_id = freq_data.pstate_id;
+               gpstates->highest_lpstate = freq_data.pstate_id;
+               goto gpstates_done;
+       }
+
+       if (gpstates->last_gpstate > freq_data.pstate_id) {
+               gpstates->elapsed_time += cur_msec -
+                                                gpstates->last_sampled_time;
+
+               /*
+                * If its has been ramping down for more than MAX_RAMP_DOWN_TIME
+                * we should be resetting all global pstate related data. Set it
+                * equal to local pstate to start fresh.
+                */
+               if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
+                       reset_gpstates(policy);
+                       gpstates->highest_lpstate = freq_data.pstate_id;
+                       gpstate_id = freq_data.pstate_id;
+               } else {
+               /* Elaspsed_time is less than 5 seconds, continue to rampdown */
+                       gpstate_id = calc_global_pstate(gpstates->elapsed_time,
+                                                       gpstates->highest_lpstate,
+                                                       freq_data.pstate_id);
+               }
+       } else {
+               reset_gpstates(policy);
+               gpstates->highest_lpstate = freq_data.pstate_id;
+               gpstate_id = freq_data.pstate_id;
+       }
+
+       /*
+        * If local pstate is equal to global pstate, rampdown is over
+        * So timer is not required to be queued.
+        */
+       if (gpstate_id != freq_data.pstate_id)
+               queue_gpstate_timer(gpstates);
+       else
+               del_timer_sync(&gpstates->timer);
+
+gpstates_done:
+       freq_data.gpstate_id = gpstate_id;
+       gpstates->last_sampled_time = cur_msec;
+       gpstates->last_gpstate = freq_data.gpstate_id;
+       gpstates->last_lpstate = freq_data.pstate_id;
+
+       spin_unlock(&gpstates->gpstate_lock);
+
        /*
         * Use smp_call_function to send IPI and execute the
         * mtspr on target CPU.  We could do that without IPI
         * if current CPU is within policy->cpus (core)
         */
        smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
-
        return 0;
 }
 
 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-       int base, i;
+       int base, i, ret;
+       struct kernfs_node *kn;
+       struct global_pstate_info *gpstates;
 
        base = cpu_first_thread_sibling(policy->cpu);
 
        for (i = 0; i < threads_per_core; i++)
                cpumask_set_cpu(base + i, policy->cpus);
 
-       if (!policy->driver_data) {
+       kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
+       if (!kn) {
                int ret;
 
                ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
@@ -470,13 +688,37 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
                                policy->cpu);
                        return ret;
                }
-               /*
-                * policy->driver_data is used as a flag for one-time
-                * creation of throttle sysfs files.
-                */
-               policy->driver_data = policy;
+       } else {
+               kernfs_put(kn);
        }
-       return cpufreq_table_validate_and_show(policy, powernv_freqs);
+
+       gpstates =  kzalloc(sizeof(*gpstates), GFP_KERNEL);
+       if (!gpstates)
+               return -ENOMEM;
+
+       policy->driver_data = gpstates;
+
+       /* initialize timer */
+       init_timer_deferrable(&gpstates->timer);
+       gpstates->timer.data = (unsigned long)policy;
+       gpstates->timer.function = gpstate_timer_handler;
+       gpstates->timer.expires = jiffies +
+                               msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
+       spin_lock_init(&gpstates->gpstate_lock);
+       ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
+
+       if (ret < 0)
+               kfree(policy->driver_data);
+
+       return ret;
+}
+
+static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+       /* timer is deleted in cpufreq_cpu_stop() */
+       kfree(policy->driver_data);
+
+       return 0;
 }
 
 static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
@@ -604,15 +846,19 @@ static struct notifier_block powernv_cpufreq_opal_nb = {
 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
 {
        struct powernv_smp_call_data freq_data;
+       struct global_pstate_info *gpstates = policy->driver_data;
 
        freq_data.pstate_id = powernv_pstate_info.min;
+       freq_data.gpstate_id = powernv_pstate_info.min;
        smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
+       del_timer_sync(&gpstates->timer);
 }
 
 static struct cpufreq_driver powernv_cpufreq_driver = {
        .name           = "powernv-cpufreq",
        .flags          = CPUFREQ_CONST_LOOPS,
        .init           = powernv_cpufreq_cpu_init,
+       .exit           = powernv_cpufreq_cpu_exit,
        .verify         = cpufreq_generic_frequency_table_verify,
        .target_index   = powernv_cpufreq_target_index,
        .get            = powernv_cpufreq_get,