From e128c864070055e062f6c90c64c03aad18452ac3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 3 Dec 2015 09:37:49 +0530 Subject: cpufreq: ondemand: Update sampling rate only for concerned policies We are comparing policy->governor against cpufreq_gov_ondemand to make sure that we update the sampling rate only for the concerned CPUs. But that isn't enough. In the case of governor_per_policy, there can be multiple instances of the ondemand governor, and with the current code we will always end up updating all of them. What we rather need to do is compare dbs_data with policy->governor_data, which will match only for the policies governed by dbs_data. This code is also racy, as the governor might be getting stopped at that time, and we may end up scheduling work for a policy that we have just disabled. Fix that by protecting the entire function with &od_dbs_cdata.mutex, which will protect against races with policy START/STOP/etc. Once this lock is in place, we can safely get the policy via the per-cpu dbs_info. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_ondemand.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 03ac6ce54042..089ca6a6ca02 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -252,20 +252,39 @@ static void update_sampling_rate(struct dbs_data *dbs_data, od_tuners->sampling_rate = new_rate = max(new_rate, dbs_data->min_sampling_rate); + /* + * Lock governor so that governor start/stop can't execute in parallel. + */ + mutex_lock(&od_dbs_cdata.mutex); + for_each_online_cpu(cpu) { struct cpufreq_policy *policy; struct od_cpu_dbs_info_s *dbs_info; + struct cpu_dbs_info *cdbs; + struct cpu_common_dbs_info *shared; unsigned long next_sampling, appointed_at; - policy = cpufreq_cpu_get(cpu); - if (!policy) + dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + cdbs = &dbs_info->cdbs; + shared = cdbs->shared; + + /* + * A valid shared and shared->policy means governor hasn't + * stopped or exited yet. + */ + if (!shared || !shared->policy) continue; - if (policy->governor != &cpufreq_gov_ondemand) { - cpufreq_cpu_put(policy); + + policy = shared->policy; + + /* + * Update sampling rate for CPUs whose policy is governed by + * dbs_data. In case of governor_per_policy, only a single + * policy will be governed by dbs_data, otherwise there can be + * multiple policies that are governed by the same dbs_data. + */ + if (dbs_data != policy->governor_data) continue; - } - dbs_info = &per_cpu(od_cpu_dbs_info, cpu); - cpufreq_cpu_put(policy); if (!delayed_work_pending(&dbs_info->cdbs.dwork)) continue; @@ -281,6 +300,8 @@ static void update_sampling_rate(struct dbs_data *dbs_data, } } + + mutex_unlock(&od_dbs_cdata.mutex); } static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, -- cgit v1.2.3 From e68fe18c5b5442baca162ccf3b273326e6132a51 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 3 Dec 2015 09:37:50 +0530 Subject: cpufreq: ondemand: Work is guaranteed to be pending We are guaranteed to have work scheduled for policy->cpus, as the policy isn't stopped yet, and so there is no need to check that again. Drop it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/cpufreq_ondemand.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 089ca6a6ca02..08f2aa602f9e 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -286,9 +286,6 @@ static void update_sampling_rate(struct dbs_data *dbs_data, if (dbs_data != policy->governor_data) continue; - if (!delayed_work_pending(&dbs_info->cdbs.dwork)) - continue; - next_sampling = jiffies + usecs_to_jiffies(new_rate); appointed_at = dbs_info->cdbs.dwork.timer.expires; -- cgit v1.2.3 From affde5d06af1e39c2929e36a063e3912f02fc58f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 3 Dec 2015 09:37:51 +0530 Subject: cpufreq: governor: Pass policy as argument to ->gov_dbs_timer() Pass 'policy' as argument to ->gov_dbs_timer() instead of cdbs and dbs_data. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_conservative.c | 6 +++--- drivers/cpufreq/cpufreq_governor.c | 2 +- drivers/cpufreq/cpufreq_governor.h | 3 +-- drivers/cpufreq/cpufreq_ondemand.c | 5 ++--- 4 files changed, 7 insertions(+), 9 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 1fa1deb6e91f..606ad74abe6e 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -115,13 +115,13 @@ static void cs_check_cpu(int cpu, unsigned int load) } } -static unsigned int cs_dbs_timer(struct cpu_dbs_info *cdbs, - struct dbs_data *dbs_data, bool modify_all) +static unsigned int cs_dbs_timer(struct cpufreq_policy *policy, bool modify_all) { + struct dbs_data *dbs_data = policy->governor_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; if (modify_all) - dbs_check_cpu(dbs_data, cdbs->shared->policy->cpu); + dbs_check_cpu(dbs_data, policy->cpu); return delay_for_sampling_rate(cs_tuners->sampling_rate); } diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index b260576ddb12..cdcb56a49b28 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -253,7 +253,7 @@ static void dbs_timer(struct work_struct *work) if (!need_load_eval(cdbs->shared, sampling_rate)) modify_all = false; - delay = dbs_data->cdata->gov_dbs_timer(cdbs, dbs_data, modify_all); + delay = dbs_data->cdata->gov_dbs_timer(policy, modify_all); gov_queue_work(dbs_data, policy, delay, modify_all); unlock: diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 5621bb03e874..0c7589016b6c 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -209,8 +209,7 @@ struct common_dbs_data { struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu); void *(*get_cpu_dbs_info_s)(int cpu); - unsigned int (*gov_dbs_timer)(struct cpu_dbs_info *cdbs, - struct dbs_data *dbs_data, + unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy, bool modify_all); void (*gov_check_cpu)(int cpu, unsigned int load); int (*init)(struct dbs_data *dbs_data, bool notify); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 08f2aa602f9e..fc0384b4d02d 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -191,10 +191,9 @@ static void od_check_cpu(int cpu, unsigned int load) } } -static unsigned int od_dbs_timer(struct cpu_dbs_info *cdbs, - struct dbs_data *dbs_data, bool modify_all) 
+static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all) { - struct cpufreq_policy *policy = cdbs->shared->policy; + struct dbs_data *dbs_data = policy->governor_data; unsigned int cpu = policy->cpu; struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); -- cgit v1.2.3 From 5e4500d8dba16d88b528cf037566b84747ec23f0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 3 Dec 2015 09:37:52 +0530 Subject: cpufreq: governor: initialize/destroy timer_mutex with 'shared' timer_mutex needs to be initialized only when the memory for 'shared' is allocated, and likewise it needs to be destroyed only when the memory for 'shared' is freed. There is no need to do either every time we start/stop the governor. Move the code that initializes/destroys timer_mutex to the relevant places. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index cdcb56a49b28..999e1f6addf9 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -287,6 +287,7 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy, for_each_cpu(j, policy->related_cpus) cdata->get_cpu_cdbs(j)->shared = shared; + mutex_init(&shared->timer_mutex); return 0; } @@ -297,6 +298,8 @@ static void free_common_dbs_info(struct cpufreq_policy *policy, struct cpu_common_dbs_info *shared = cdbs->shared; int j; + mutex_destroy(&shared->timer_mutex); + for_each_cpu(j, policy->cpus) cdata->get_cpu_cdbs(j)->shared = NULL; @@ -433,7 +436,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy, shared->policy = policy; shared->time_stamp = ktime_get(); - mutex_init(&shared->timer_mutex); for_each_cpu(j, policy->cpus) { struct cpu_dbs_info *j_cdbs = cdata->get_cpu_cdbs(j); @@ -493,8 +495,6 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy, mutex_unlock(&shared->timer_mutex); gov_cancel_work(dbs_data, policy); - - mutex_destroy(&shared->timer_mutex); return 0; } -- cgit v1.2.3 From 70f43e5e798c8818d97d8d6a9bd4cd3235af9686 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 9 Dec 2015 07:34:42 +0530 Subject: cpufreq: governor: replace per-CPU delayed work with timers cpufreq governors evaluate load at the sampling rate and, based on that, update the frequency for a group of CPUs belonging to the same cpufreq policy. This is required to be done in a single thread for all policy->cpus, but because we don't want to wake up idle CPUs just for that, we use deferrable work for this. If we had used a single delayed deferrable work item for the entire policy, there was a chance that the CPU required to run the handler would be idle, and we might end up not changing the frequency for the entire group despite load variations. And so we were forced to keep per-CPU work items, where only the one that expires first needs to do the real work and the others are rescheduled for the next sampling time. We have been using the more complex solution until now: a delayed deferrable work item per CPU, which is a combination of a timer and a work item. This can be made lightweight by keeping per-CPU deferrable timers with a single work item, which is scheduled by the first timer that expires. This patch does just that, and here are the important changes: - The timer handler will run in irq context and so we need to use a spin_lock instead of the timer_mutex.
And so a separate timer_lock is created. This also makes the use of the mutex and lock quite clear, as we know what exactly they are protecting. - A new field 'skip_work' is added to track when the timer handlers can queue a work. More comments present in code. Suggested-by: Rafael J. Wysocki Signed-off-by: Viresh Kumar Reviewed-by: Ashwin Chaugule Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.c | 142 ++++++++++++++++++++++--------------- drivers/cpufreq/cpufreq_governor.h | 20 ++++-- drivers/cpufreq/cpufreq_ondemand.c | 8 +-- 3 files changed, 100 insertions(+), 70 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 999e1f6addf9..2d61eae5cc5d 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -158,47 +158,53 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) } EXPORT_SYMBOL_GPL(dbs_check_cpu); -static inline void __gov_queue_work(int cpu, struct dbs_data *dbs_data, - unsigned int delay) +void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay) { - struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); - - mod_delayed_work_on(cpu, system_wq, &cdbs->dwork, delay); -} - -void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy, - unsigned int delay, bool all_cpus) -{ - int i; + struct dbs_data *dbs_data = policy->governor_data; + struct cpu_dbs_info *cdbs; + int cpu; - if (!all_cpus) { - /* - * Use raw_smp_processor_id() to avoid preemptible warnings. - * We know that this is only called with all_cpus == false from - * works that have been queued with *_work_on() functions and - * those works are canceled during CPU_DOWN_PREPARE so they - * can't possibly run on any other CPU. - */ - __gov_queue_work(raw_smp_processor_id(), dbs_data, delay); - } else { - for_each_cpu(i, policy->cpus) - __gov_queue_work(i, dbs_data, delay); + for_each_cpu(cpu, policy->cpus) { + cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); + cdbs->timer.expires = jiffies + delay; + add_timer_on(&cdbs->timer, cpu); } } -EXPORT_SYMBOL_GPL(gov_queue_work); +EXPORT_SYMBOL_GPL(gov_add_timers); -static inline void gov_cancel_work(struct dbs_data *dbs_data, - struct cpufreq_policy *policy) +static inline void gov_cancel_timers(struct cpufreq_policy *policy) { + struct dbs_data *dbs_data = policy->governor_data; struct cpu_dbs_info *cdbs; int i; for_each_cpu(i, policy->cpus) { cdbs = dbs_data->cdata->get_cpu_cdbs(i); - cancel_delayed_work_sync(&cdbs->dwork); + del_timer_sync(&cdbs->timer); } } +void gov_cancel_work(struct cpu_common_dbs_info *shared) +{ + unsigned long flags; + + /* + * No work will be queued from timer handlers after skip_work is + * updated. And so we can safely cancel the work first and then the + * timers. 
+ */ + spin_lock_irqsave(&shared->timer_lock, flags); + shared->skip_work++; + spin_unlock_irqrestore(&shared->timer_lock, flags); + + cancel_work_sync(&shared->work); + + gov_cancel_timers(shared->policy); + + shared->skip_work = 0; +} +EXPORT_SYMBOL_GPL(gov_cancel_work); + /* Will return if we need to evaluate cpu load again or not */ static bool need_load_eval(struct cpu_common_dbs_info *shared, unsigned int sampling_rate) @@ -217,29 +223,22 @@ static bool need_load_eval(struct cpu_common_dbs_info *shared, return true; } -static void dbs_timer(struct work_struct *work) +static void dbs_work_handler(struct work_struct *work) { - struct cpu_dbs_info *cdbs = container_of(work, struct cpu_dbs_info, - dwork.work); - struct cpu_common_dbs_info *shared = cdbs->shared; + struct cpu_common_dbs_info *shared = container_of(work, struct + cpu_common_dbs_info, work); struct cpufreq_policy *policy; struct dbs_data *dbs_data; unsigned int sampling_rate, delay; - bool modify_all = true; - - mutex_lock(&shared->timer_mutex); + unsigned long flags; + bool eval_load; policy = shared->policy; - - /* - * Governor might already be disabled and there is no point continuing - * with the work-handler. - */ - if (!policy) - goto unlock; - dbs_data = policy->governor_data; + /* Kill all timers */ + gov_cancel_timers(policy); + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; @@ -250,14 +249,43 @@ static void dbs_timer(struct work_struct *work) sampling_rate = od_tuners->sampling_rate; } - if (!need_load_eval(cdbs->shared, sampling_rate)) - modify_all = false; + eval_load = need_load_eval(shared, sampling_rate); - delay = dbs_data->cdata->gov_dbs_timer(policy, modify_all); - gov_queue_work(dbs_data, policy, delay, modify_all); - -unlock: + /* + * Make sure cpufreq_governor_limits() isn't evaluating load in + * parallel. 
+ */ + mutex_lock(&shared->timer_mutex); + delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load); mutex_unlock(&shared->timer_mutex); + + spin_lock_irqsave(&shared->timer_lock, flags); + shared->skip_work--; + spin_unlock_irqrestore(&shared->timer_lock, flags); + + gov_add_timers(policy, delay); +} + +static void dbs_timer_handler(unsigned long data) +{ + struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data; + struct cpu_common_dbs_info *shared = cdbs->shared; + unsigned long flags; + + spin_lock_irqsave(&shared->timer_lock, flags); + + /* + * Timer handler isn't allowed to queue work at the moment, because: + * - Another timer handler has done that + * - We are stopping the governor + * - Or we are updating the sampling rate of ondemand governor + */ + if (!shared->skip_work) { + shared->skip_work++; + queue_work(system_wq, &shared->work); + } + + spin_unlock_irqrestore(&shared->timer_lock, flags); } static void set_sampling_rate(struct dbs_data *dbs_data, @@ -288,6 +316,8 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy, cdata->get_cpu_cdbs(j)->shared = shared; mutex_init(&shared->timer_mutex); + spin_lock_init(&shared->timer_lock); + INIT_WORK(&shared->work, dbs_work_handler); return 0; } @@ -452,7 +482,9 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy, if (ignore_nice) j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; - INIT_DEFERRABLE_WORK(&j_cdbs->dwork, dbs_timer); + __setup_timer(&j_cdbs->timer, dbs_timer_handler, + (unsigned long)j_cdbs, + TIMER_DEFERRABLE | TIMER_IRQSAFE); } if (cdata->governor == GOV_CONSERVATIVE) { @@ -470,8 +502,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy, od_ops->powersave_bias_init_cpu(cpu); } - gov_queue_work(dbs_data, policy, delay_for_sampling_rate(sampling_rate), - true); + gov_add_timers(policy, delay_for_sampling_rate(sampling_rate)); return 0; } @@ -485,16 +516,9 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy, if (!shared || !shared->policy) return -EBUSY; - /* - * Work-handler must see this updated, as it should not proceed any - * further after governor is disabled. And so timer_mutex is taken while - * updating this value. - */ - mutex_lock(&shared->timer_mutex); + gov_cancel_work(shared); shared->policy = NULL; - mutex_unlock(&shared->timer_mutex); - gov_cancel_work(dbs_data, policy); return 0; } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 0c7589016b6c..76742902491e 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -132,12 +132,20 @@ static void *get_cpu_dbs_info_s(int cpu) \ struct cpu_common_dbs_info { struct cpufreq_policy *policy; /* - * percpu mutex that serializes governor limit change with dbs_timer - * invocation. We do not want dbs_timer to run when user is changing - * the governor or limits. + * Per policy mutex that serializes load evaluation from limit-change + * and work-handler. */ struct mutex timer_mutex; + + /* + * Per policy lock that serializes access to queuing work from timer + * handlers. + */ + spinlock_t timer_lock; + ktime_t time_stamp; + unsigned int skip_work; + struct work_struct work; }; /* Per cpu structures */ @@ -152,7 +160,7 @@ struct cpu_dbs_info { * wake-up from idle. 
*/ unsigned int prev_load; - struct delayed_work dwork; + struct timer_list timer; struct cpu_common_dbs_info *shared; }; @@ -268,11 +276,11 @@ static ssize_t show_sampling_rate_min_gov_pol \ extern struct mutex cpufreq_governor_lock; +void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay); +void gov_cancel_work(struct cpu_common_dbs_info *shared); void dbs_check_cpu(struct dbs_data *dbs_data, int cpu); int cpufreq_governor_dbs(struct cpufreq_policy *policy, struct common_dbs_data *cdata, unsigned int event); -void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy, - unsigned int delay, bool all_cpus); void od_register_powersave_bias_handler(unsigned int (*f) (struct cpufreq_policy *, unsigned int, unsigned int), unsigned int powersave_bias); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index fc0384b4d02d..f879012cf849 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -286,13 +286,11 @@ static void update_sampling_rate(struct dbs_data *dbs_data, continue; next_sampling = jiffies + usecs_to_jiffies(new_rate); - appointed_at = dbs_info->cdbs.dwork.timer.expires; + appointed_at = dbs_info->cdbs.timer.expires; if (time_before(next_sampling, appointed_at)) { - cancel_delayed_work_sync(&dbs_info->cdbs.dwork); - - gov_queue_work(dbs_data, policy, - usecs_to_jiffies(new_rate), true); + gov_cancel_work(shared); + gov_add_timers(policy, usecs_to_jiffies(new_rate)); } } -- cgit v1.2.3 From f08f638b9c7f1bf3cb9006d3d26bf568d807ede0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 3 Dec 2015 09:37:54 +0530 Subject: cpufreq: ondemand: update update_sampling_rate() to make it more efficient Currently update_sampling_rate() runs over each online CPU and cancels/queues timers on all policy->cpus every time. This should be done just once for any cpu belonging to a policy. Create a cpumask and keep on clearing it as and when we process policies, so that we don't have to traverse through all CPUs of the same policy. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_ondemand.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index f879012cf849..eae51070c034 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -246,6 +246,7 @@ static void update_sampling_rate(struct dbs_data *dbs_data, unsigned int new_rate) { struct od_dbs_tuners *od_tuners = dbs_data->tuners; + struct cpumask cpumask; int cpu; od_tuners->sampling_rate = new_rate = max(new_rate, @@ -256,7 +257,9 @@ static void update_sampling_rate(struct dbs_data *dbs_data, */ mutex_lock(&od_dbs_cdata.mutex); - for_each_online_cpu(cpu) { + cpumask_copy(&cpumask, cpu_online_mask); + + for_each_cpu(cpu, &cpumask) { struct cpufreq_policy *policy; struct od_cpu_dbs_info_s *dbs_info; struct cpu_dbs_info *cdbs; @@ -276,6 +279,9 @@ static void update_sampling_rate(struct dbs_data *dbs_data, policy = shared->policy; + /* clear all CPUs of this policy */ + cpumask_andnot(&cpumask, &cpumask, policy->cpus); + /* * Update sampling rate for CPUs whose policy is governed by * dbs_data. 
In case of governor_per_policy, only a single @@ -285,6 +291,10 @@ static void update_sampling_rate(struct dbs_data *dbs_data, if (dbs_data != policy->governor_data) continue; + /* + * Checking this for any CPU should be fine, timers for all of + * them are scheduled together. + */ next_sampling = jiffies + usecs_to_jiffies(new_rate); appointed_at = dbs_info->cdbs.timer.expires; -- cgit v1.2.3 From 2dd3e724b4e2237cfaaf155cab72af02c1c420cc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 8 Dec 2015 21:44:05 +0100 Subject: cpufreq: governor: Use lockless timer function It is possible to get rid of the timer_lock spinlock used by the governor timer function for synchronization, but a couple of races need to be avoided. The first race is between multiple dbs_timer_handler() instances that may be running in parallel with each other on different CPUs. Namely, one of them has to queue up the work item, but it cannot be queued up more than once. To achieve that, atomic_inc_return() can be used on the skip_work field of struct cpu_common_dbs_info. The second race is between an already running dbs_timer_handler() and gov_cancel_work(). In that case the dbs_timer_handler() might not notice the skip_work incrementation in gov_cancel_work() and it might queue up its work item after gov_cancel_work() had returned (and that work item would corrupt skip_work going forward). To prevent that from happening, gov_cancel_work() can be made wait for the timer function to complete (on all CPUs) right after skip_work has been incremented. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq_governor.c | 49 +++++++++++++++++--------------------- drivers/cpufreq/cpufreq_governor.h | 9 ++----- 2 files changed, 24 insertions(+), 34 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 2d61eae5cc5d..4de12fd35b1f 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -186,22 +186,24 @@ static inline void gov_cancel_timers(struct cpufreq_policy *policy) void gov_cancel_work(struct cpu_common_dbs_info *shared) { - unsigned long flags; - + /* Tell dbs_timer_handler() to skip queuing up work items. */ + atomic_inc(&shared->skip_work); /* - * No work will be queued from timer handlers after skip_work is - * updated. And so we can safely cancel the work first and then the - * timers. + * If dbs_timer_handler() is already running, it may not notice the + * incremented skip_work, so wait for it to complete to prevent its work + * item from being queued up after the cancel_work_sync() below. + */ + gov_cancel_timers(shared->policy); + /* + * In case dbs_timer_handler() managed to run and spawn a work item + * before the timers have been canceled, wait for that work item to + * complete and then cancel all of the timers set up by it. If + * dbs_timer_handler() runs again at that point, it will see the + * positive value of skip_work and won't spawn any more work items. 
*/ - spin_lock_irqsave(&shared->timer_lock, flags); - shared->skip_work++; - spin_unlock_irqrestore(&shared->timer_lock, flags); - cancel_work_sync(&shared->work); - gov_cancel_timers(shared->policy); - - shared->skip_work = 0; + atomic_set(&shared->skip_work, 0); } EXPORT_SYMBOL_GPL(gov_cancel_work); @@ -230,7 +232,6 @@ static void dbs_work_handler(struct work_struct *work) struct cpufreq_policy *policy; struct dbs_data *dbs_data; unsigned int sampling_rate, delay; - unsigned long flags; bool eval_load; policy = shared->policy; @@ -259,9 +260,7 @@ static void dbs_work_handler(struct work_struct *work) delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load); mutex_unlock(&shared->timer_mutex); - spin_lock_irqsave(&shared->timer_lock, flags); - shared->skip_work--; - spin_unlock_irqrestore(&shared->timer_lock, flags); + atomic_dec(&shared->skip_work); gov_add_timers(policy, delay); } @@ -270,22 +269,18 @@ static void dbs_timer_handler(unsigned long data) { struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data; struct cpu_common_dbs_info *shared = cdbs->shared; - unsigned long flags; - - spin_lock_irqsave(&shared->timer_lock, flags); /* - * Timer handler isn't allowed to queue work at the moment, because: + * Timer handler may not be allowed to queue the work at the moment, + * because: * - Another timer handler has done that * - We are stopping the governor - * - Or we are updating the sampling rate of ondemand governor + * - Or we are updating the sampling rate of the ondemand governor */ - if (!shared->skip_work) { - shared->skip_work++; + if (atomic_inc_return(&shared->skip_work) > 1) + atomic_dec(&shared->skip_work); + else queue_work(system_wq, &shared->work); - } - - spin_unlock_irqrestore(&shared->timer_lock, flags); } static void set_sampling_rate(struct dbs_data *dbs_data, @@ -316,7 +311,7 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy, cdata->get_cpu_cdbs(j)->shared = shared; mutex_init(&shared->timer_mutex); - spin_lock_init(&shared->timer_lock); + atomic_set(&shared->skip_work, 0); INIT_WORK(&shared->work, dbs_work_handler); return 0; } diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 76742902491e..91e767a058a7 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -17,6 +17,7 @@ #ifndef _CPUFREQ_GOVERNOR_H #define _CPUFREQ_GOVERNOR_H +#include #include #include #include @@ -137,14 +138,8 @@ struct cpu_common_dbs_info { */ struct mutex timer_mutex; - /* - * Per policy lock that serializes access to queuing work from timer - * handlers. - */ - spinlock_t timer_lock; - ktime_t time_stamp; - unsigned int skip_work; + atomic_t skip_work; struct work_struct work; }; -- cgit v1.2.3 From f8fa8ae06b8c2c25d81c99766f9226adc5c3e073 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 17 Nov 2015 12:06:22 +0000 Subject: cpufreq-dt: Supply power coefficient when registering cooling devices Support registering cooling devices with dynamic power coefficient where provided by the device tree. This allows OF registered cooling devices driver to be used with the power_allocator thermal governor. Signed-off-by: Punit Agrawal Acked-by: Viresh Kumar Reviewed-by: Javi Merino Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq-dt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 90d64081ddb3..1ceece9d6711 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -407,8 +407,13 @@ static void cpufreq_ready(struct cpufreq_policy *policy) * thermal DT code takes care of matching them. */ if (of_find_property(np, "#cooling-cells", NULL)) { - priv->cdev = of_cpufreq_cooling_register(np, - policy->related_cpus); + u32 power_coefficient = 0; + + of_property_read_u32(np, "dynamic-power-coefficient", + &power_coefficient); + + priv->cdev = of_cpufreq_power_cooling_register(np, + policy->related_cpus, power_coefficient, NULL); if (IS_ERR(priv->cdev)) { dev_err(priv->cpu_dev, "running cpufreq without cooling device: %ld\n", -- cgit v1.2.3 From 2f7e8a175db72bdaf377235962fd85796edb3fbc Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 17 Nov 2015 12:06:23 +0000 Subject: cpufreq: arm_big_little: Add support to register a cpufreq cooling device Register passive cooling devices when initialising cpufreq on big.LITTLE systems. If the device tree provides a dynamic power coefficient for the CPUs then the bound cooling device will support the extensions that allow it to be used with all the existing thermal governors including the power allocator governor. A cooling device will be created per individual frequency domain and can be bound to thermal zones via the thermal DT bindings. Signed-off-by: Punit Agrawal Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Kconfig.arm | 2 ++ drivers/cpufreq/arm_big_little.c | 41 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 235a1ba73d92..80fbfb32b5a9 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -6,6 +6,8 @@ config ARM_BIG_LITTLE_CPUFREQ tristate "Generic ARM big LITTLE CPUfreq driver" depends on (ARM_CPU_TOPOLOGY || ARM64) && HAVE_CLK + # if CPU_THERMAL is on and THERMAL=m, ARM_BIT_LITTLE_CPUFREQ cannot be =y + depends on !CPU_THERMAL || THERMAL select PM_OPP help This enables the Generic CPUfreq driver for ARM big.LITTLE platforms. diff --git a/drivers/cpufreq/arm_big_little.c b/drivers/cpufreq/arm_big_little.c index c5d256caa664..c251247ae661 100644 --- a/drivers/cpufreq/arm_big_little.c +++ b/drivers/cpufreq/arm_big_little.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,7 @@ static bool bL_switching_enabled; #define ACTUAL_FREQ(cluster, freq) ((cluster == A7_CLUSTER) ? freq << 1 : freq) #define VIRT_FREQ(cluster, freq) ((cluster == A7_CLUSTER) ? 
freq >> 1 : freq) +static struct thermal_cooling_device *cdev[MAX_CLUSTERS]; static struct cpufreq_arm_bL_ops *arm_bL_ops; static struct clk *clk[MAX_CLUSTERS]; static struct cpufreq_frequency_table *freq_table[MAX_CLUSTERS + 1]; @@ -493,6 +495,12 @@ static int bL_cpufreq_init(struct cpufreq_policy *policy) static int bL_cpufreq_exit(struct cpufreq_policy *policy) { struct device *cpu_dev; + int cur_cluster = cpu_to_cluster(policy->cpu); + + if (cur_cluster < MAX_CLUSTERS) { + cpufreq_cooling_unregister(cdev[cur_cluster]); + cdev[cur_cluster] = NULL; + } cpu_dev = get_cpu_device(policy->cpu); if (!cpu_dev) { @@ -507,6 +515,38 @@ static int bL_cpufreq_exit(struct cpufreq_policy *policy) return 0; } +static void bL_cpufreq_ready(struct cpufreq_policy *policy) +{ + struct device *cpu_dev = get_cpu_device(policy->cpu); + int cur_cluster = cpu_to_cluster(policy->cpu); + struct device_node *np; + + /* Do not register a cpu_cooling device if we are in IKS mode */ + if (cur_cluster >= MAX_CLUSTERS) + return; + + np = of_node_get(cpu_dev->of_node); + if (WARN_ON(!np)) + return; + + if (of_find_property(np, "#cooling-cells", NULL)) { + u32 power_coefficient = 0; + + of_property_read_u32(np, "dynamic-power-coefficient", + &power_coefficient); + + cdev[cur_cluster] = of_cpufreq_power_cooling_register(np, + policy->related_cpus, power_coefficient, NULL); + if (IS_ERR(cdev[cur_cluster])) { + dev_err(cpu_dev, + "running cpufreq without cooling device: %ld\n", + PTR_ERR(cdev[cur_cluster])); + cdev[cur_cluster] = NULL; + } + } + of_node_put(np); +} + static struct cpufreq_driver bL_cpufreq_driver = { .name = "arm-big-little", .flags = CPUFREQ_STICKY | @@ -517,6 +557,7 @@ static struct cpufreq_driver bL_cpufreq_driver = { .get = bL_cpufreq_get_rate, .init = bL_cpufreq_init, .exit = bL_cpufreq_exit, + .ready = bL_cpufreq_ready, .attr = cpufreq_generic_attr, }; -- cgit v1.2.3 From 790d849bf811a8ab5d4cd2cce0f6fda92f6aebf2 Mon Sep 17 00:00:00 2001 From: Jacob Tanenbaum Date: Thu, 19 Nov 2015 10:29:01 -0500 Subject: cpufreq: pcc-cpufreq: update default value of cpuinfo_transition_latency The cpufreq documentation specifies policy->cpuinfo.transition_latency as the time it takes on this CPU to switch between two frequencies, in nanoseconds (if appropriate, else specify CPUFREQ_ETERNAL). Currently pcc-cpufreq does not expose the value and sets it to zero. I changed the pcc-cpufreq driver and its documentation to conform to the default value specified in Documentation/cpu-freq/cpu-drivers.txt Signed-off-by: Jacob Tanenbaum Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- Documentation/cpu-freq/pcc-cpufreq.txt | 4 ++-- drivers/cpufreq/pcc-cpufreq.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/cpufreq') diff --git a/Documentation/cpu-freq/pcc-cpufreq.txt b/Documentation/cpu-freq/pcc-cpufreq.txt index 9e3c3b33514c..0a94224ad296 100644 --- a/Documentation/cpu-freq/pcc-cpufreq.txt +++ b/Documentation/cpu-freq/pcc-cpufreq.txt @@ -159,8 +159,8 @@ to be strictly associated with a P-state. 2.2 cpuinfo_transition_latency: ------------------------------- -The cpuinfo_transition_latency field is 0. The PCC specification does -not include a field to expose this value currently. +The cpuinfo_transition_latency field is CPUFREQ_ETERNAL. The PCC specification +does not include a field to expose this value currently.
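(For context on why a zero value was harmful: the common governor code of this era derives the minimum sampling interval from the reported latency. A simplified sketch of that logic, adapted from cpufreq_governor.c of the same period; the helper name tune_sampling is made up for illustration:

	static void tune_sampling(struct dbs_data *dbs_data,
				  struct cpufreq_policy *policy)
	{
		/* policy latency is in ns; convert it to us first */
		unsigned int latency = policy->cpuinfo.transition_latency / 1000;

		if (latency == 0)
			latency = 1;

		/* bring kernel and HW constraints together */
		dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate,
						  MIN_LATENCY_MULTIPLIER * latency);
		set_sampling_rate(dbs_data, max(dbs_data->min_sampling_rate,
						latency * LATENCY_MULTIPLIER));
	}

So the reported latency directly bounds how often a dynamic governor may sample, which is why a driver that cannot measure it should report CPUFREQ_ETERNAL rather than 0.)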
2.3 cpuinfo_cur_freq: --------------------- diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c index 2a0d58959acf..808a320e9d5d 100644 --- a/drivers/cpufreq/pcc-cpufreq.c +++ b/drivers/cpufreq/pcc-cpufreq.c @@ -555,6 +555,8 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->min = policy->cpuinfo.min_freq = ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + pr_debug("init: policy->max is %d, policy->min is %d\n", policy->max, policy->min); out: -- cgit v1.2.3 From 8ae1702a0df5e0730607b97fd9fd1f8066870832 Mon Sep 17 00:00:00 2001 From: Hongtao Jia Date: Thu, 26 Nov 2015 17:21:11 +0800 Subject: cpufreq: qoriq: Register cooling device based on device tree Register the qoriq cpufreq driver as a cooling device, based on the thermal device tree framework. When temperature crosses the passive trip point cpufreq is used to throttle CPUs. Signed-off-by: Jia Hongtao Reviewed-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/qoriq-cpufreq.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/qoriq-cpufreq.c b/drivers/cpufreq/qoriq-cpufreq.c index 358f0752c31e..b23e525a7af3 100644 --- a/drivers/cpufreq/qoriq-cpufreq.c +++ b/drivers/cpufreq/qoriq-cpufreq.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ struct cpu_data { struct clk **pclk; struct cpufreq_frequency_table *table; + struct thermal_cooling_device *cdev; }; /** @@ -321,6 +323,27 @@ static int qoriq_cpufreq_target(struct cpufreq_policy *policy, return clk_set_parent(policy->clk, parent); } + +static void qoriq_cpufreq_ready(struct cpufreq_policy *policy) +{ + struct cpu_data *cpud = policy->driver_data; + struct device_node *np = of_get_cpu_node(policy->cpu, NULL); + + if (of_find_property(np, "#cooling-cells", NULL)) { + cpud->cdev = of_cpufreq_cooling_register(np, + policy->related_cpus); + + if (IS_ERR(cpud->cdev)) { + pr_err("Failed to register cooling device cpu%d: %ld\n", + policy->cpu, PTR_ERR(cpud->cdev)); + + cpud->cdev = NULL; + } + } + + of_node_put(np); +} + static struct cpufreq_driver qoriq_cpufreq_driver = { .name = "qoriq_cpufreq", .flags = CPUFREQ_CONST_LOOPS, @@ -329,6 +352,7 @@ static struct cpufreq_driver qoriq_cpufreq_driver = { .verify = cpufreq_generic_frequency_table_verify, .target_index = qoriq_cpufreq_target, .get = cpufreq_generic_get, + .ready = qoriq_cpufreq_ready, .attr = cpufreq_generic_attr, }; -- cgit v1.2.3 From 9bb46b87d662ab704bd852db9916f0e51db3e94b Mon Sep 17 00:00:00 2001 From: Pi-Cheng Chen Date: Sun, 29 Nov 2015 16:31:35 +0800 Subject: cpufreq: mt8173: add CPUFREQ_HAVE_GOVERNOR_PER_POLICY flag Add CPUFREQ_HAVE_GOVERNOR_PER_POLICY to have individual set of tunables for each cluster of MT8173. Signed-off-by: Pi-Cheng Chen Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/mt8173-cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c index 83001dc5b646..c43810946446 100644 --- a/drivers/cpufreq/mt8173-cpufreq.c +++ b/drivers/cpufreq/mt8173-cpufreq.c @@ -469,7 +469,8 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy) } static struct cpufreq_driver mt8173_cpufreq_driver = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | + CPUFREQ_HAVE_GOVERNOR_PER_POLICY, .verify = cpufreq_generic_frequency_table_verify, .target_index = mtk_cpufreq_set_target, .get = cpufreq_generic_get, -- cgit v1.2.3 From 93625d52e7a74492416f77fed945ba34e0ae0c18 Mon Sep 17 00:00:00 2001 From: Pi-Cheng Chen Date: Sun, 29 Nov 2015 16:31:36 +0800 Subject: cpufreq: mt8173: remove redundant regulator_get_voltage() call Remove redundant regulator_get_voltage() call to get Vsram value since it will be obtained later at the beginning of voltage tracking loop. Signed-off-by: Pi-Cheng Chen Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/mt8173-cpufreq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c index c43810946446..750cda7876ac 100644 --- a/drivers/cpufreq/mt8173-cpufreq.c +++ b/drivers/cpufreq/mt8173-cpufreq.c @@ -59,7 +59,6 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info, int old_vproc, old_vsram, new_vsram, vsram, vproc, ret; old_vproc = regulator_get_voltage(proc_reg); - old_vsram = regulator_get_voltage(sram_reg); /* Vsram should not exceed the maximum allowed voltage of SoC. */ new_vsram = min(new_vproc + MIN_VOLT_SHIFT, MAX_VOLT_LIMIT); -- cgit v1.2.3 From 40be4c3ccbf4078e2f8426a7962879b7a447cde4 Mon Sep 17 00:00:00 2001 From: Pi-Cheng Chen Date: Sun, 29 Nov 2015 16:31:37 +0800 Subject: cpufreq: mt8173: check return value of regulator_get_voltage() call Sometimes regulator_get_voltage() call returns negative values for reasons(e.g. underlying I2C bus timeout). Add check for the return values and fail out early. Signed-off-by: Pi-Cheng Chen Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/mt8173-cpufreq.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c index 750cda7876ac..9d0fe37b4c3e 100644 --- a/drivers/cpufreq/mt8173-cpufreq.c +++ b/drivers/cpufreq/mt8173-cpufreq.c @@ -59,6 +59,10 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info, int old_vproc, old_vsram, new_vsram, vsram, vproc, ret; old_vproc = regulator_get_voltage(proc_reg); + if (old_vproc < 0) { + pr_err("%s: invalid Vproc value: %d\n", __func__, old_vproc); + return old_vproc; + } /* Vsram should not exceed the maximum allowed voltage of SoC. 
*/ new_vsram = min(new_vproc + MIN_VOLT_SHIFT, MAX_VOLT_LIMIT); @@ -71,7 +75,17 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info, */ do { old_vsram = regulator_get_voltage(sram_reg); + if (old_vsram < 0) { + pr_err("%s: invalid Vsram value: %d\n", + __func__, old_vsram); + return old_vsram; + } old_vproc = regulator_get_voltage(proc_reg); + if (old_vproc < 0) { + pr_err("%s: invalid Vproc value: %d\n", + __func__, old_vproc); + return old_vproc; + } vsram = min(new_vsram, old_vproc + MAX_VOLT_SHIFT); @@ -116,7 +130,17 @@ static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info, */ do { old_vproc = regulator_get_voltage(proc_reg); + if (old_vproc < 0) { + pr_err("%s: invalid Vproc value: %d\n", + __func__, old_vproc); + return old_vproc; + } old_vsram = regulator_get_voltage(sram_reg); + if (old_vsram < 0) { + pr_err("%s: invalid Vsram value: %d\n", + __func__, old_vsram); + return old_vsram; + } vproc = max(new_vproc, old_vsram - MAX_VOLT_SHIFT); ret = regulator_set_voltage(proc_reg, vproc, @@ -184,6 +208,10 @@ static int mtk_cpufreq_set_target(struct cpufreq_policy *policy, old_freq_hz = clk_get_rate(cpu_clk); old_vproc = regulator_get_voltage(info->proc_reg); + if (old_vproc < 0) { + pr_err("%s: invalid Vproc value: %d\n", __func__, old_vproc); + return old_vproc; + } freq_hz = freq_table[index].frequency * 1000; -- cgit v1.2.3 From 157386b6fc1465f292b66c4133409033650ad335 Mon Sep 17 00:00:00 2001 From: Philippe Longepe Date: Fri, 4 Dec 2015 17:40:30 +0100 Subject: cpufreq: intel_pstate: Configurable algorithm to get target pstate Target systems using different cpus have different power and performance requirements. They may use different algorithms to get the next P-state based on their power or performance preference. For example, power-constrained systems may not want to use high-performance P-states as aggressively as a full-size desktop or a server platform. A server platform may want to run close to the max to achieve better performance, while laptop-like systems may prefer sacrificing performance for longer battery lifes. For the above reasons, modify intel_pstate to allow the target P-state selection algorithm to be depend on the CPU ID. Signed-off-by: Srinivas Pandruvada Signed-off-by: Philippe Longepe Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 4d07cbd2b23c..ff58029a56e2 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -66,6 +66,7 @@ static inline int ceiling_fp(int32_t x) struct sample { int32_t core_pct_busy; + int32_t busy_scaled; u64 aperf; u64 mperf; u64 tsc; @@ -133,6 +134,7 @@ struct pstate_funcs { int (*get_scaling)(void); void (*set)(struct cpudata*, int pstate); void (*get_vid)(struct cpudata *); + int32_t (*get_target_pstate)(struct cpudata *); }; struct cpu_defaults { @@ -140,6 +142,8 @@ struct cpu_defaults { struct pstate_funcs funcs; }; +static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu); + static struct pstate_adjust_policy pid_params; static struct pstate_funcs pstate_funcs; static int hwp_active; @@ -738,6 +742,7 @@ static struct cpu_defaults core_params = { .get_turbo = core_get_turbo_pstate, .get_scaling = core_get_scaling, .set = core_set_pstate, + .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -758,6 +763,7 @@ static struct cpu_defaults silvermont_params = { .set = atom_set_pstate, .get_scaling = silvermont_get_scaling, .get_vid = atom_get_vid, + .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -778,6 +784,7 @@ static struct cpu_defaults airmont_params = { .set = atom_set_pstate, .get_scaling = airmont_get_scaling, .get_vid = atom_get_vid, + .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -797,6 +804,7 @@ static struct cpu_defaults knl_params = { .get_turbo = knl_get_turbo_pstate, .get_scaling = core_get_scaling, .set = core_set_pstate, + .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -922,7 +930,7 @@ static inline void intel_pstate_set_sample_time(struct cpudata *cpu) mod_timer_pinned(&cpu->timer, jiffies + delay); } -static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu) +static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) { int32_t core_busy, max_pstate, current_pstate, sample_ratio; s64 duration_us; @@ -960,30 +968,24 @@ static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu) core_busy = mul_fp(core_busy, sample_ratio); } - return core_busy; + cpu->sample.busy_scaled = core_busy; + return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy); } static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) { - int32_t busy_scaled; - struct _pid *pid; - signed int ctl; - int from; + int from, target_pstate; struct sample *sample; from = cpu->pstate.current_pstate; - pid = &cpu->pid; - busy_scaled = intel_pstate_get_scaled_busy(cpu); + target_pstate = pstate_funcs.get_target_pstate(cpu); - ctl = pid_calc(pid, busy_scaled); - - /* Negative values of ctl increase the pstate and vice versa */ - intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl, true); + intel_pstate_set_pstate(cpu, target_pstate, true); sample = &cpu->sample; trace_pstate_sample(fp_toint(sample->core_pct_busy), - fp_toint(busy_scaled), + fp_toint(sample->busy_scaled), from, cpu->pstate.current_pstate, sample->mperf, @@ -1237,6 +1239,8 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs) pstate_funcs.get_scaling = funcs->get_scaling; pstate_funcs.set = funcs->set; pstate_funcs.get_vid = funcs->get_vid; + pstate_funcs.get_target_pstate = funcs->get_target_pstate; + } #if 
IS_ENABLED(CONFIG_ACPI) -- cgit v1.2.3 From e70eed2b64545ab5c9d2f4d43372d79762f1b985 Mon Sep 17 00:00:00 2001 From: Philippe Longepe Date: Fri, 4 Dec 2015 17:40:32 +0100 Subject: cpufreq: intel_pstate: Account for non C0 time The current function to calculate cpu utilization uses the average P-state ratio (APerf/Mperf) scaled by the ratio of the current P-state to the max available non-turbo one. This leads to an overestimation of utilization which causes higher-performance P-states to be selected more often and that leads to increased energy consumption. This is a problem for low-power systems, so it is better to use a different utilization calculation algorithm for them. Namely, the Percent Busy value (or load) can be estimated as the ratio of the MPERF counter that runs at a constant rate only during active periods (C0) to the time stamp counter (TSC) that also runs (at the same rate) during idle. That is: Percent Busy = 100 * (delta_mperf / delta_tsc) Use this algorithm for platforms with SoCs based on the Airmont and Silvermont Atom cores. Signed-off-by: Philippe Longepe Signed-off-by: Stephane Gasparini Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index ff58029a56e2..8bfebaeda2dd 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -143,6 +143,7 @@ struct cpu_defaults { }; static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu); +static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu); static struct pstate_adjust_policy pid_params; static struct pstate_funcs pstate_funcs; @@ -763,7 +764,7 @@ static struct cpu_defaults silvermont_params = { .set = atom_set_pstate, .get_scaling = silvermont_get_scaling, .get_vid = atom_get_vid, - .get_target_pstate = get_target_pstate_use_performance, + .get_target_pstate = get_target_pstate_use_cpu_load, }, }; @@ -784,7 +785,7 @@ static struct cpu_defaults airmont_params = { .set = atom_set_pstate, .get_scaling = airmont_get_scaling, .get_vid = atom_get_vid, - .get_target_pstate = get_target_pstate_use_performance, + .get_target_pstate = get_target_pstate_use_cpu_load, }, }; @@ -890,12 +891,11 @@ static inline void intel_pstate_sample(struct cpudata *cpu) local_irq_save(flags); rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); - if (cpu->prev_mperf == mperf) { + tsc = rdtsc(); + if ((cpu->prev_mperf == mperf) || (cpu->prev_tsc == tsc)) { local_irq_restore(flags); return; } - - tsc = rdtsc(); local_irq_restore(flags); cpu->last_sample_time = cpu->sample.time; @@ -930,6 +930,25 @@ static inline void intel_pstate_set_sample_time(struct cpudata *cpu) mod_timer_pinned(&cpu->timer, jiffies + delay); } +static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) +{ + struct sample *sample = &cpu->sample; + int32_t cpu_load; + + /* + * The load can be estimated as the ratio of the mperf counter + * running at a constant frequency during active periods + * (C0) and the time stamp counter running at the same frequency + * also during C-states. 
+ */ + cpu_load = div64_u64(int_tofp(100) * sample->mperf, sample->tsc); + + cpu->sample.busy_scaled = cpu_load; + + return cpu->pstate.current_pstate - pid_calc(&cpu->pid, cpu_load); +} + + static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) { int32_t core_busy, max_pstate, current_pstate, sample_ratio; -- cgit v1.2.3 From 63d1d656a5232f2f189b217b50542eadcf9d74ae Mon Sep 17 00:00:00 2001 From: Philippe Longepe Date: Fri, 4 Dec 2015 17:40:35 +0100 Subject: cpufreq: intel_pstate: Account for IO wait time In cases where we have many IOs, the global load becomes low and the load algorithm will decrease the requested P-State. Because of that, the IOs overheads will increase and impact the IO performances. To improve IO bound work, we can count the io-wait time as busy time in calculating CPU busy. This change uses get_cpu_iowait_time_us() to obtain the IO wait time value and converts time into number of cycles spent waiting on IO at the TSC rate. At the moment, this trick is only used for Atom. Signed-off-by: Philippe Longepe Signed-off-by: Stephane Gasparini Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 8bfebaeda2dd..efc581392bd1 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -113,6 +113,7 @@ struct cpudata { u64 prev_aperf; u64 prev_mperf; u64 prev_tsc; + u64 prev_cummulative_iowait; struct sample sample; }; @@ -933,22 +934,39 @@ static inline void intel_pstate_set_sample_time(struct cpudata *cpu) static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) { struct sample *sample = &cpu->sample; + u64 cummulative_iowait, delta_iowait_us; + u64 delta_iowait_mperf; + u64 mperf, now; int32_t cpu_load; + cummulative_iowait = get_cpu_iowait_time_us(cpu->cpu, &now); + + /* + * Convert iowait time into number of IO cycles spent at max_freq. + * IO is considered as busy only for the cpu_load algorithm. For + * performance this is not needed since we always try to reach the + * maximum P-State, so we are already boosting the IOs. + */ + delta_iowait_us = cummulative_iowait - cpu->prev_cummulative_iowait; + delta_iowait_mperf = div64_u64(delta_iowait_us * cpu->pstate.scaling * + cpu->pstate.max_pstate, MSEC_PER_SEC); + + mperf = cpu->sample.mperf + delta_iowait_mperf; + cpu->prev_cummulative_iowait = cummulative_iowait; + + /* * The load can be estimated as the ratio of the mperf counter * running at a constant frequency during active periods * (C0) and the time stamp counter running at the same frequency * also during C-states. */ - cpu_load = div64_u64(int_tofp(100) * sample->mperf, sample->tsc); - + cpu_load = div64_u64(int_tofp(100) * mperf, sample->tsc); cpu->sample.busy_scaled = cpu_load; return cpu->pstate.current_pstate - pid_calc(&cpu->pid, cpu_load); } - static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) { int32_t core_busy, max_pstate, current_pstate, sample_ratio; -- cgit v1.2.3 From 89b56047f6f9b15fa3e9df3e34fa391835972ab7 Mon Sep 17 00:00:00 2001 From: Pi-Cheng Chen Date: Thu, 10 Dec 2015 11:48:13 +0800 Subject: cpufreq: mt8173: Move resources allocation into ->probe() Since the return value of ->init() of cpufreq driver is not propagated to the device driver model now, move resources allocation into ->probe() to handle -EPROBE_DEFER properly. 
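The underlying pattern, as a minimal sketch with made-up foo_* names rather than this driver's actual code: any resource lookup that may return -EPROBE_DEFER belongs in the platform driver's ->probe(), whose return value the driver core honors and retries:

	static struct clk *foo_clk;

	static int foo_cpufreq_probe(struct platform_device *pdev)
	{
		struct clk *clk = clk_get(&pdev->dev, NULL);

		if (IS_ERR(clk))
			/* may be -EPROBE_DEFER; the core will retry probe() later */
			return PTR_ERR(clk);

		foo_clk = clk;
		return cpufreq_register_driver(&foo_cpufreq_driver);
	}

A cpufreq driver's ->init() callback, by contrast, is invoked by the cpufreq core when a policy is created, and its return value is not propagated to the device driver model, so returning -EPROBE_DEFER from there cannot trigger a retry.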
Signed-off-by: Pi-Cheng Chen Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/mt8173-cpufreq.c | 92 +++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 24 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c index 9d0fe37b4c3e..fd601b92f5ec 100644 --- a/drivers/cpufreq/mt8173-cpufreq.c +++ b/drivers/cpufreq/mt8173-cpufreq.c @@ -41,16 +41,35 @@ * the original PLL becomes stable at target frequency. */ struct mtk_cpu_dvfs_info { + struct cpumask cpus; struct device *cpu_dev; struct regulator *proc_reg; struct regulator *sram_reg; struct clk *cpu_clk; struct clk *inter_clk; struct thermal_cooling_device *cdev; + struct list_head list_head; int intermediate_voltage; bool need_voltage_tracking; }; +static LIST_HEAD(dvfs_info_list); + +static struct mtk_cpu_dvfs_info *mtk_cpu_dvfs_info_lookup(int cpu) +{ + struct mtk_cpu_dvfs_info *info; + struct list_head *list; + + list_for_each(list, &dvfs_info_list) { + info = list_entry(list, struct mtk_cpu_dvfs_info, list_head); + + if (cpumask_test_cpu(cpu, &info->cpus)) + return info; + } + + return NULL; +} + static int mtk_cpufreq_voltage_tracking(struct mtk_cpu_dvfs_info *info, int new_vproc) { @@ -402,6 +421,9 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu) */ info->need_voltage_tracking = !IS_ERR(sram_reg); + /* CPUs in the same cluster share a clock and power domain. */ + cpumask_copy(&info->cpus, &cpu_topology[cpu].core_sibling); + return 0; out_free_opp_table: @@ -440,22 +462,18 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy) struct cpufreq_frequency_table *freq_table; int ret; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - - ret = mtk_cpu_dvfs_info_init(info, policy->cpu); - if (ret) { - pr_err("%s failed to initialize dvfs info for cpu%d\n", - __func__, policy->cpu); - goto out_free_dvfs_info; + info = mtk_cpu_dvfs_info_lookup(policy->cpu); + if (!info) { + pr_err("dvfs info for cpu%d is not initialized.\n", + policy->cpu); + return -EINVAL; } ret = dev_pm_opp_init_cpufreq_table(info->cpu_dev, &freq_table); if (ret) { pr_err("failed to init cpufreq table for cpu%d: %d\n", policy->cpu, ret); - goto out_release_dvfs_info; + return ret; } ret = cpufreq_table_validate_and_show(policy, freq_table); @@ -464,8 +482,7 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy) goto out_free_cpufreq_table; } - /* CPUs in the same cluster share a clock and power domain. 
*/ - cpumask_copy(policy->cpus, &cpu_topology[policy->cpu].core_sibling); + cpumask_copy(policy->cpus, &info->cpus); policy->driver_data = info; policy->clk = info->cpu_clk; @@ -473,13 +490,6 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy) out_free_cpufreq_table: dev_pm_opp_free_cpufreq_table(info->cpu_dev, &freq_table); - -out_release_dvfs_info: - mtk_cpu_dvfs_info_release(info); - -out_free_dvfs_info: - kfree(info); - return ret; } @@ -489,8 +499,6 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy) cpufreq_cooling_unregister(info->cdev); dev_pm_opp_free_cpufreq_table(info->cpu_dev, &policy->freq_table); - mtk_cpu_dvfs_info_release(info); - kfree(info); return 0; } @@ -510,11 +518,47 @@ static struct cpufreq_driver mt8173_cpufreq_driver = { static int mt8173_cpufreq_probe(struct platform_device *pdev) { - int ret; + struct mtk_cpu_dvfs_info *info; + struct list_head *list, *tmp; + int cpu, ret; + + for_each_possible_cpu(cpu) { + info = mtk_cpu_dvfs_info_lookup(cpu); + if (info) + continue; + + info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL); + if (!info) { + ret = -ENOMEM; + goto release_dvfs_info_list; + } + + ret = mtk_cpu_dvfs_info_init(info, cpu); + if (ret) { + dev_err(&pdev->dev, + "failed to initialize dvfs info for cpu%d\n", + cpu); + goto release_dvfs_info_list; + } + + list_add(&info->list_head, &dvfs_info_list); + } ret = cpufreq_register_driver(&mt8173_cpufreq_driver); - if (ret) - pr_err("failed to register mtk cpufreq driver\n"); + if (ret) { + dev_err(&pdev->dev, "failed to register mtk cpufreq driver\n"); + goto release_dvfs_info_list; + } + + return 0; + +release_dvfs_info_list: + list_for_each_safe(list, tmp, &dvfs_info_list) { + info = list_entry(list, struct mtk_cpu_dvfs_info, list_head); + + mtk_cpu_dvfs_info_release(info); + list_del(list); + } return ret; } -- cgit v1.2.3 From ab0ea257fc58d8742f73f50fba3797dfe001aa3c Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 10 Dec 2015 09:42:16 +0000 Subject: cpufreq: st: Provide runtime initialised driver for ST's platforms The bootloader is charged with the responsibility to provide platform specific Dynamic Voltage and Frequency Scaling (DVFS) information via Device Tree. This driver takes the supplied configuration and registers it with the new generic OPP framework, to then be used with CPUFreq. Acked-by: Viresh Kumar Signed-off-by: Lee Jones Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Kconfig.arm | 10 ++ drivers/cpufreq/Makefile | 1 + drivers/cpufreq/sti-cpufreq.c | 294 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 305 insertions(+) create mode 100644 drivers/cpufreq/sti-cpufreq.c (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 80fbfb32b5a9..ff9be3661480 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -219,6 +219,16 @@ config ARM_SPEAR_CPUFREQ help This adds the CPUFreq driver support for SPEAr SOCs. +config ARM_STI_CPUFREQ + tristate "STi CPUFreq support" + depends on SOC_STIH407 + help + This driver uses the generic OPP framework to match the running + platform with a predefined set of suitable values. If not provided + we will fall-back so safe-values contained in Device Tree. Enable + this config option if you wish to add CPUFreq support for STi based + SoCs. 
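The matching this Kconfig entry describes relies on the OPP core's supported-hardware mechanism, which the driver body below uses roughly as follows (illustrative fragment only; the real driver computes the masks from syscon registers, as shown later):

	unsigned int version[] = { BIT(major), BIT(minor), BIT(substrate) };

	/* Only OPP table entries whose opp-supported-hw property matches
	 * all three masks remain eligible for the cpufreq table. */
	ret = dev_pm_opp_set_supported_hw(cpu_dev, version, ARRAY_SIZE(version));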
+
 config ARM_TEGRA20_CPUFREQ
 	bool "Tegra20 CPUFreq support"
 	depends on ARCH_TEGRA
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index c0af1a1281c8..9e63fb1b09f8 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -73,6 +73,7 @@ obj-$(CONFIG_ARM_SA1100_CPUFREQ)	+= sa1100-cpufreq.o
 obj-$(CONFIG_ARM_SA1110_CPUFREQ)	+= sa1110-cpufreq.o
 obj-$(CONFIG_ARM_SCPI_CPUFREQ)		+= scpi-cpufreq.o
 obj-$(CONFIG_ARM_SPEAR_CPUFREQ)		+= spear-cpufreq.o
+obj-$(CONFIG_ARM_STI_CPUFREQ)		+= sti-cpufreq.o
 obj-$(CONFIG_ARM_TEGRA20_CPUFREQ)	+= tegra20-cpufreq.o
 obj-$(CONFIG_ARM_TEGRA124_CPUFREQ)	+= tegra124-cpufreq.o
 obj-$(CONFIG_ARM_VEXPRESS_SPC_CPUFREQ)	+= vexpress-spc-cpufreq.o
diff --git a/drivers/cpufreq/sti-cpufreq.c b/drivers/cpufreq/sti-cpufreq.c
new file mode 100644
index 000000000000..a9c659f58974
--- /dev/null
+++ b/drivers/cpufreq/sti-cpufreq.c
@@ -0,0 +1,295 @@
+/*
+ * Match running platform with pre-defined OPP values for CPUFreq
+ *
+ * Author: Ajit Pal Singh
+ *         Lee Jones
+ *
+ * Copyright (C) 2015 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the version 2 of the GNU General Public License as
+ * published by the Free Software Foundation
+ */
+
+#include <linux/cpu.h>
+#include <linux/io.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/pm_opp.h>
+#include <linux/regmap.h>
+
+#define VERSION_ELEMENTS	3
+#define MAX_PCODE_NAME_LEN	7
+
+#define VERSION_SHIFT		28
+#define HW_INFO_INDEX		1
+#define MAJOR_ID_INDEX		1
+#define MINOR_ID_INDEX		2
+
+/*
+ * Only match on "suitable for ALL versions" entries
+ *
+ * This will be used with the BIT() macro.  It sets the
+ * top bit of a 32bit value and is equal to 0x80000000.
+ */
+#define DEFAULT_VERSION		31
+
+enum {
+	PCODE = 0,
+	SUBSTRATE,
+	DVFS_MAX_REGFIELDS,
+};
+
+/**
+ * ST CPUFreq Driver Data
+ *
+ * @cpu:		boot CPU device
+ * @syscfg_eng:		Engineering Syscon register map
+ * @syscfg:		Syscon register map
+ */
+static struct sti_cpufreq_ddata {
+	struct device *cpu;
+	struct regmap *syscfg_eng;
+	struct regmap *syscfg;
+} ddata;
+
+static int sti_cpufreq_fetch_major(void)
+{
+	struct device_node *np = ddata.cpu->of_node;
+	struct device *dev = ddata.cpu;
+	unsigned int major_offset;
+	unsigned int socid;
+	int ret;
+
+	ret = of_property_read_u32_index(np, "st,syscfg",
+					 MAJOR_ID_INDEX, &major_offset);
+	if (ret) {
+		dev_err(dev, "No major number offset provided in %s [%d]\n",
+			np->full_name, ret);
+		return ret;
+	}
+
+	ret = regmap_read(ddata.syscfg, major_offset, &socid);
+	if (ret) {
+		dev_err(dev, "Failed to read major number from syscon [%d]\n",
+			ret);
+		return ret;
+	}
+
+	return ((socid >> VERSION_SHIFT) & 0xf) + 1;
+}
+
+static int sti_cpufreq_fetch_minor(void)
+{
+	struct device *dev = ddata.cpu;
+	struct device_node *np = dev->of_node;
+	unsigned int minor_offset;
+	unsigned int minid;
+	int ret;
+
+	ret = of_property_read_u32_index(np, "st,syscfg-eng",
+					 MINOR_ID_INDEX, &minor_offset);
+	if (ret) {
+		dev_err(dev,
+			"No minor number offset provided in %s [%d]\n",
+			np->full_name, ret);
+		return ret;
+	}
+
+	ret = regmap_read(ddata.syscfg_eng, minor_offset, &minid);
+	if (ret) {
+		dev_err(dev,
+			"Failed to read the minor number from syscon [%d]\n",
+			ret);
+		return ret;
+	}
+
+	return minid & 0xf;
+}
+
+static int sti_cpufreq_fetch_regmap_field(const struct reg_field *reg_fields,
+					  int hw_info_offset, int field)
+{
+	struct regmap_field *regmap_field;
+	struct reg_field reg_field = reg_fields[field];
+	struct device *dev = ddata.cpu;
+	unsigned int value;
+	int ret;
+
+	reg_field.reg = hw_info_offset;
+	regmap_field = devm_regmap_field_alloc(dev,
+					       ddata.syscfg_eng,
+					       reg_field);
+	if (IS_ERR(regmap_field)) {
+		dev_err(dev, "Failed to allocate reg field\n");
+		return PTR_ERR(regmap_field);
+	}
+
+	ret = regmap_field_read(regmap_field, &value);
+	if (ret) {
+		dev_err(dev, "Failed to read %s code\n",
+			field ? "SUBSTRATE" : "PCODE");
+		return ret;
+	}
+
+	return value;
+}
+
+static const struct reg_field sti_stih407_dvfs_regfields[DVFS_MAX_REGFIELDS] = {
+	[PCODE] = REG_FIELD(0, 16, 19),
+	[SUBSTRATE] = REG_FIELD(0, 0, 2),
+};
+
+static const struct reg_field *sti_cpufreq_match(void)
+{
+	if (of_machine_is_compatible("st,stih407") ||
+	    of_machine_is_compatible("st,stih410"))
+		return sti_stih407_dvfs_regfields;
+
+	return NULL;
+}
+
+static int sti_cpufreq_set_opp_info(void)
+{
+	struct device *dev = ddata.cpu;
+	struct device_node *np = dev->of_node;
+	const struct reg_field *reg_fields;
+	unsigned int hw_info_offset;
+	unsigned int version[VERSION_ELEMENTS];
+	int pcode, substrate, major, minor;
+	int ret;
+	char name[MAX_PCODE_NAME_LEN];
+
+	reg_fields = sti_cpufreq_match();
+	if (!reg_fields) {
+		dev_err(dev, "This SoC doesn't support voltage scaling\n");
+		return -ENODEV;
+	}
+
+	ret = of_property_read_u32_index(np, "st,syscfg-eng",
+					 HW_INFO_INDEX, &hw_info_offset);
+	if (ret) {
+		dev_warn(dev, "Failed to read HW info offset from DT\n");
+		substrate = DEFAULT_VERSION;
+		pcode = 0;
+		goto use_defaults;
+	}
+
+	pcode = sti_cpufreq_fetch_regmap_field(reg_fields,
+					       hw_info_offset,
+					       PCODE);
+	if (pcode < 0) {
+		dev_warn(dev, "Failed to obtain process code\n");
+		/* Use default pcode */
+		pcode = 0;
+	}
+
+	substrate = sti_cpufreq_fetch_regmap_field(reg_fields,
+						   hw_info_offset,
+						   SUBSTRATE);
+	if (substrate) {
+		dev_warn(dev, "Failed to obtain substrate code\n");
+		/* Use default substrate */
+		substrate = DEFAULT_VERSION;
+	}
+
+use_defaults:
+	major = sti_cpufreq_fetch_major();
+	if (major < 0) {
+		dev_err(dev, "Failed to obtain major version\n");
+		/* Use default major number */
+		major = DEFAULT_VERSION;
+	}
+
+	minor = sti_cpufreq_fetch_minor();
+	if (minor < 0) {
+		dev_err(dev, "Failed to obtain minor version\n");
+		/* Use default minor number */
+		minor = DEFAULT_VERSION;
+	}
+
+	snprintf(name, MAX_PCODE_NAME_LEN, "pcode%d", pcode);
+
+	ret = dev_pm_opp_set_prop_name(dev, name);
+	if (ret) {
+		dev_err(dev, "Failed to set prop name\n");
+		return ret;
+	}
+
+	version[0] = BIT(major);
+	version[1] = BIT(minor);
+	version[2] = BIT(substrate);
+
+	ret = dev_pm_opp_set_supported_hw(dev, version, VERSION_ELEMENTS);
+	if (ret) {
+		dev_err(dev, "Failed to set supported hardware\n");
+		return ret;
+	}
+
+	dev_dbg(dev, "pcode: %d major: %d minor: %d substrate: %d\n",
+		pcode, major, minor, substrate);
+	dev_dbg(dev, "version[0]: %x version[1]: %x version[2]: %x\n",
+		version[0], version[1], version[2]);
+
+	return 0;
+}
+
+static int sti_cpufreq_fetch_syscon_registers(void)
+{
+	struct device *dev = ddata.cpu;
+	struct device_node *np = dev->of_node;
+
+	ddata.syscfg = syscon_regmap_lookup_by_phandle(np, "st,syscfg");
+	if (IS_ERR(ddata.syscfg)) {
+		dev_err(dev, "\"st,syscfg\" not supplied\n");
+		return PTR_ERR(ddata.syscfg);
+	}
+
+	ddata.syscfg_eng = syscon_regmap_lookup_by_phandle(np, "st,syscfg-eng");
+	if (IS_ERR(ddata.syscfg_eng)) {
+		dev_err(dev, "\"st,syscfg-eng\" not supplied\n");
+		return PTR_ERR(ddata.syscfg_eng);
+	}
+
+	return 0;
+}
+
+static int sti_cpufreq_init(void)
+{
+	int ret;
+
+	ddata.cpu = get_cpu_device(0);
+	if (!ddata.cpu) {
+		pr_err("Failed to get device for CPU0\n");
+		goto skip_voltage_scaling;
+	}
+
+	if (!of_get_property(ddata.cpu->of_node, "operating-points-v2", NULL)) {
+		dev_err(ddata.cpu, "OPP-v2 not supported\n");
+		goto skip_voltage_scaling;
+	}
+
+	ret = sti_cpufreq_fetch_syscon_registers();
+	if (ret)
+		goto skip_voltage_scaling;
+
+	ret = sti_cpufreq_set_opp_info();
+	if (!ret)
+		goto register_cpufreq_dt;
+
+skip_voltage_scaling:
+	dev_err(ddata.cpu, "Not doing voltage scaling\n");
+
+register_cpufreq_dt:
+	platform_device_register_simple("cpufreq-dt", -1, NULL, 0);
+
+	return 0;
+}
+module_init(sti_cpufreq_init);
+
+MODULE_DESCRIPTION("STMicroelectronics CPUFreq/OPP driver");
+MODULE_AUTHOR("Ajitpal Singh ");
+MODULE_AUTHOR("Lee Jones ");
+MODULE_LICENSE("GPL v2");
-- cgit v1.2.3
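
The sti-cpufreq driver above is built on two OPP framework helpers from the
same development cycle: dev_pm_opp_set_prop_name(), which makes the OPP core
prefer "opp-microvolt-<name>" style properties, and
dev_pm_opp_set_supported_hw(), which records an array of version bitmasks so
that an OPP stays enabled only if each of its "opp-supported-hw" cells shares
at least one set bit with the corresponding array element. The following is a
minimal sketch of that calling pattern in isolation, assuming the
int-returning signatures these helpers had in this kernel era;
example_select_cpu_opps() and the concrete version numbers are invented for
illustration and are not part of the patch.

#include <linux/bitops.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/pm_opp.h>

#define EXAMPLE_VERSION_ELEMENTS	3	/* major, minor, substrate */

static int __init example_select_cpu_opps(void)
{
	struct device *cpu_dev = get_cpu_device(0);
	u32 versions[EXAMPLE_VERSION_ELEMENTS];
	int ret;

	if (!cpu_dev)
		return -ENODEV;

	/*
	 * Pretend the version registers reported major 2, minor 1,
	 * substrate 0.  Each element is a bitmask of acceptable values
	 * for one "opp-supported-hw" cell; BIT(n) accepts version n.
	 */
	versions[0] = BIT(2);	/* major */
	versions[1] = BIT(1);	/* minor */
	versions[2] = BIT(0);	/* substrate */

	/* Prefer "opp-microvolt-pcode0" over plain "opp-microvolt". */
	ret = dev_pm_opp_set_prop_name(cpu_dev, "pcode0");
	if (ret)
		return ret;

	/*
	 * Disable every OPP whose "opp-supported-hw" cells do not AND
	 * non-zero with each element of versions[].
	 */
	return dev_pm_opp_set_supported_hw(cpu_dev, versions,
					   EXAMPLE_VERSION_ELEMENTS);
}
module_init(example_select_cpu_opps);

MODULE_DESCRIPTION("Example OPP supported-hw selection");
MODULE_LICENSE("GPL v2");

Because sti_cpufreq_set_opp_info() falls back to DEFAULT_VERSION (bit 31)
whenever a field cannot be read, OPP entries whose cells include 0x80000000
act as the "suitable for ALL versions" catch-all described in the comment
above DEFAULT_VERSION.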