From 994a8f2514e91c16616c4a1b53e9eb2b24de97b7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 21 Feb 2017 10:15:18 +0530 Subject: cpufreq: schedutil: Redefine the rate_limit_us tunable The rate_limit_us tunable is intended to reduce the possible overhead from running the schedutil governor. However, that overhead can be divided into two separate parts: the governor computations and the invocation of the scaling driver to set the CPU frequency. The latter is where the real overhead comes from. The former is much less expensive in terms of execution time and running it every time the governor callback is invoked by the scheduler, after rate_limit_us interval has passed since the last frequency update, would not be a problem. For this reason, redefine the rate_limit_us tunable so that it means the minimum time that has to pass between two consecutive invocations of the scaling driver by the schedutil governor (to set the CPU frequency). Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index cd7cd489f739..78468aa051ab 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -93,14 +93,13 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, { struct cpufreq_policy *policy = sg_policy->policy; - sg_policy->last_freq_update_time = time; - if (policy->fast_switch_enabled) { if (sg_policy->next_freq == next_freq) { trace_cpu_frequency(policy->cur, smp_processor_id()); return; } sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; @@ -109,6 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, trace_cpu_frequency(next_freq, smp_processor_id()); } else if (sg_policy->next_freq != next_freq) { sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } -- cgit v1.2.3-55-g7522 From cba1dfb57b94c234728b689d9b00d4267fa1a879 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 9 Mar 2017 09:34:54 +0530 Subject: cpufreq: schedutil: Refactor sugov_next_freq_shared() The loop in sugov_next_freq_shared() contains an if block to skip the loop for the current CPU. This turns out to be an unnecessary conditional in the scheduler's hot-path for every CPU in the policy. It would be better to drop the conditional and make the loop treat all the CPUs in the same way. That would eliminate the need of calling sugov_iowait_boost() at the top of the routine. To keep the code optimized to return early if the current CPU has RT/DL flags set, move the flags check to sugov_update_shared() instead in order to avoid the function call entirely. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 78468aa051ab..f5ffe241812e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -217,30 +217,19 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, - unsigned long util, unsigned long max, - unsigned int flags) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned long util = 0, max = 1; unsigned int j; - if (flags & SCHED_CPUFREQ_RT_DL) - return max_f; - - sugov_iowait_boost(sg_cpu, &util, &max); - for_each_cpu(j, policy->cpus) { - struct sugov_cpu *j_sg_cpu; + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; s64 delta_ns; - if (j == smp_processor_id()) - continue; - - j_sg_cpu = &per_cpu(sugov_cpu, j); /* * If the CPU utilization was last updated before the previous * frequency update and the time elapsed between the last update @@ -254,7 +243,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) - return max_f; + return policy->cpuinfo.max_freq; j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; @@ -289,7 +278,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + if (flags & SCHED_CPUFREQ_RT_DL) + next_f = sg_policy->policy->cpuinfo.max_freq; + else + next_f = sugov_next_freq_shared(sg_cpu); + sugov_update_commit(sg_policy, time, next_f); } -- cgit v1.2.3-55-g7522 From b7eaf1aab9f8bd2e49fceed77ebc66c1b5800718 Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Wed, 22 Mar 2017 00:08:50 +0100 Subject: cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely The way the schedutil governor uses the PELT metric causes it to underestimate the CPU utilization in some cases. That can be easily demonstrated by running kernel compilation on a Sandy Bridge Intel processor, running turbostat in parallel with it and looking at the values written to the MSR_IA32_PERF_CTL register. Namely, the expected result would be that when all CPUs were 100% busy, all of them would be requested to run in the maximum P-state, but observation shows that this clearly isn't the case. The CPUs run in the maximum P-state for a while and then are requested to run slower and go back to the maximum P-state after a while again. That causes the actual frequency of the processor to visibly oscillate below the sustainable maximum in a jittery fashion which clearly is not desirable. That has been attributed to CPU utilization metric updates on task migration that cause the total utilization value for the CPU to be reduced by the utilization of the migrated task. If that happens, the schedutil governor may see a CPU utilization reduction and will attempt to reduce the CPU frequency accordingly right away. That may be premature, though, for example if the system is generally busy and there are other runnable tasks waiting to be run on that CPU already. This is unlikely to be an issue on systems where cpufreq policies are shared between multiple CPUs, because in those cases the policy utilization is computed as the maximum of the CPU utilization values over the whole policy and if that turns out to be low, reducing the frequency for the policy most likely is a good idea anyway. On systems with one CPU per policy, however, it may affect performance adversely and even lead to increased energy consumption in some cases. On those systems it may be addressed by taking another utilization metric into consideration, like whether or not the CPU whose frequency is about to be reduced has been idle recently, because if that's not the case, the CPU is likely to be busy in the near future and its frequency should not be reduced. To that end, use the counter of idle calls in the timekeeping code. Namely, make the schedutil governor look at that counter for the current CPU every time before its frequency is about to be reduced. If the counter has not changed since the previous iteration of the governor computations for that CPU, the CPU has been busy for all that time and its frequency should not be decreased, so if the new frequency would be lower than the one set previously, the governor will skip the frequency update. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Reviewed-by: Joel Fernandes --- include/linux/tick.h | 1 + kernel/sched/cpufreq_schedutil.c | 27 +++++++++++++++++++++++++++ kernel/time/tick-sched.c | 12 ++++++++++++ 3 files changed, 40 insertions(+) diff --git a/include/linux/tick.h b/include/linux/tick.h index a04fea19676f..fe01e68bf520 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -117,6 +117,7 @@ extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); +extern unsigned long tick_nohz_get_idle_calls(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f5ffe241812e..c1ffb5dc8af6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -61,6 +61,11 @@ struct sugov_cpu { unsigned long util; unsigned long max; unsigned int flags; + + /* The field below is for single-CPU policies only. */ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long saved_idle_calls; +#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -192,6 +197,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, sg_cpu->iowait_boost >>= 1; } +#ifdef CONFIG_NO_HZ_COMMON +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +{ + unsigned long idle_calls = tick_nohz_get_idle_calls(); + bool ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; +} +#else +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +#endif /* CONFIG_NO_HZ_COMMON */ + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { @@ -200,6 +218,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; + bool busy; sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -207,12 +226,20 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; + busy = sugov_cpu_is_busy(sg_cpu); + if (flags & SCHED_CPUFREQ_RT_DL) { next_f = policy->cpuinfo.max_freq; } else { sugov_get_util(&util, &max); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. + */ + if (busy && next_f < sg_policy->next_freq) + next_f = sg_policy->next_freq; } sugov_update_commit(sg_policy, time, next_f); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7fe53be86077..64c97fc130c4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -- cgit v1.2.3-55-g7522 From 38d4ea229d25d30be6bf41bcd6cd663a587866ca Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Wed, 22 Mar 2017 18:32:47 +0100 Subject: cpufreq: schedutil: Trace frequency only if it has changed sugov_update_commit() calls trace_cpu_frequency() to record the current CPU frequency if it has not changed in the fast switch case to prevent utilities from getting confused (they may report that the CPU is idle if the frequency has not been recorded for too long, for example). However, that may cause the tracepoint to be triggered quite often for no real reason (if the frequency doesn't change, we will not modify the last update time stamp and governor computations may run again shortly when that happens), so don't do that (arguably, it is done to work around a utilities bug anyway). That allows code duplication in sugov_update_commit() to be reduced somewhat too. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- kernel/sched/cpufreq_schedutil.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c1ffb5dc8af6..1054f868d95c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -98,22 +98,20 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, { struct cpufreq_policy *policy = sg_policy->policy; + if (sg_policy->next_freq == next_freq) + return; + + sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; + if (policy->fast_switch_enabled) { - if (sg_policy->next_freq == next_freq) { - trace_cpu_frequency(policy->cur, smp_processor_id()); - return; - } - sg_policy->next_freq = next_freq; - sg_policy->last_freq_update_time = time; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); - } else if (sg_policy->next_freq != next_freq) { - sg_policy->next_freq = next_freq; - sg_policy->last_freq_update_time = time; + } else { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } -- cgit v1.2.3-55-g7522 From 39b64aa1c007b98727db9f501266454fa403166c Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Thu, 30 Mar 2017 23:36:41 +0200 Subject: cpufreq: schedutil: Reduce frequencies slower The schedutil governor reduces frequencies too fast in some situations which cases undesirable performance drops to appear. To address that issue, make schedutil reduce the frequency slower by setting it to the average of the value chosen during the previous iteration of governor computations and the new one coming from its frequency selection formula. Link: https://bugzilla.kernel.org/show_bug.cgi?id=194963 Reported-by: John Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- kernel/sched/cpufreq_schedutil.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 848cb47094cd..b1fedf9932d6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -101,6 +101,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (sg_policy->next_freq == next_freq) return; + if (sg_policy->next_freq > next_freq) + next_freq = (sg_policy->next_freq + next_freq) >> 1; + sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; -- cgit v1.2.3-55-g7522 From 1b72e7fd304639f1cd49d1e11955c4974936d88c Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Tue, 11 Apr 2017 00:20:41 +0200 Subject: cpufreq: schedutil: Use policy-dependent transition delays Make the schedutil governor take the initial (default) value of the rate_limit_us sysfs attribute from the (new) transition_delay_us policy parameter (to be set by the scaling driver). That will allow scaling drivers to make schedutil use smaller default values of rate_limit_us and reduce the default average time interval between consecutive frequency changes. Make intel_pstate set transition_delay_us to 500. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/intel_pstate.c | 2 ++ include/linux/cpufreq.h | 7 +++++++ kernel/sched/cpufreq_schedutil.c | 15 ++++++++++----- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index c31b72b16c2b..b7de5bd76a31 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -41,6 +41,7 @@ #define INTEL_PSTATE_HWP_SAMPLING_INTERVAL (50 * NSEC_PER_MSEC) #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 +#define INTEL_CPUFREQ_TRANSITION_DELAY 500 #ifdef CONFIG_ACPI #include @@ -2237,6 +2238,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) return ret; policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; + policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; /* This reflects the intel_pstate_get_cpu_pstates() setting. */ policy->cur = policy->cpuinfo.min_freq; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 87165f06a307..a5ce0bbeadb5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -120,6 +120,13 @@ struct cpufreq_policy { bool fast_switch_possible; bool fast_switch_enabled; + /* + * Preferred average time interval between consecutive invocations of + * the driver to set the frequency for this policy. To be set by the + * scaling driver (0, which is the default, means no preference). + */ + unsigned int transition_delay_us; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; int cached_resolved_idx; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index b1fedf9932d6..76877a62b5fa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -494,7 +494,6 @@ static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; struct sugov_tunables *tunables; - unsigned int lat; int ret = 0; /* State should be equivalent to EXIT */ @@ -533,10 +532,16 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; + if (policy->transition_delay_us) { + tunables->rate_limit_us = policy->transition_delay_us; + } else { + unsigned int lat; + + tunables->rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + } policy->governor_data = sg_policy; sg_policy->tunables = tunables; -- cgit v1.2.3-55-g7522