From 3289bdb429884c0279bf9ab72dff7b934f19dfc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Apr 2015 13:19:42 +0200 Subject: sched: Move the loadavg code to a more obvious location I could not find the loadavg code.. turns out it was hidden in a file called proc.c. It further got mingled up with the cruft per rq load indexes (which we really want to get rid of). Move the per rq load indexes into the fair.c load-balance code (that's the only thing that uses them) and rename proc.c to loadavg.c so we can find it again. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Paul Gortmaker Cc: Thomas Gleixner [ Did minor cleanups to the code. ] Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/core.c | 7 +- kernel/sched/fair.c | 183 ++++++++++++++++ kernel/sched/loadavg.c | 394 +++++++++++++++++++++++++++++++++ kernel/sched/proc.c | 584 ------------------------------------------------- kernel/sched/sched.h | 8 +- 6 files changed, 588 insertions(+), 590 deletions(-) create mode 100644 kernel/sched/loadavg.c delete mode 100644 kernel/sched/proc.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be87024875..67687973ce80 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o proc.o clock.o cputime.o +obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fdf972d56f65..527fc28a737a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2397,9 +2397,9 @@ unsigned long nr_iowait_cpu(int cpu) void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { - struct rq *this = this_rq(); - *nr_waiters = atomic_read(&this->nr_iowait); - *load = this->cpu_load[0]; + struct rq *rq = this_rq(); + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq->load.weight; } #ifdef CONFIG_SMP @@ -2497,6 +2497,7 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ffeaa4105e48..4bc6013886ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4323,6 +4323,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +/* + * per rq 'load' arrray crap; XXX kill this. + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +static void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + unsigned long load = this_rq->cfs.runnable_load_avg; + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, load, 1); +} + /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c new file mode 100644 index 000000000000..ef7159012cf3 --- /dev/null +++ b/kernel/sched/loadavg.c @@ -0,0 +1,394 @@ +/* + * kernel/sched/loadavg.c + * + * This file contains the magic bits required to compute the global loadavg + * figure. Its a silly number but people think its important. We go through + * great pains to make it work on big machines and tickless kernels. + */ + +#include + +#include "sched.h" + +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + +/* Variables and functions for calc_load */ +atomic_long_t calc_load_tasks; +unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long)this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +/* + * a1 = a0 * e + a * (1 - e) + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); + long delta; + + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ + delta = calc_load_fold_active(this_rq); + if (delta) { + int idx = calc_load_write_idx(); + + atomic_long_add(delta, &calc_load_idle[idx]); + } +} + +void calc_load_exit_idle(void) +{ + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); + + return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(void) +{ + long delta, active, n; + + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; +} +#else /* !CONFIG_NO_HZ_COMMON */ + +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + * + * Called from the global timer code. + */ +void calc_global_load(unsigned long ticks) +{ + long active, delta; + + if (time_before(jiffies, calc_load_update + 10)) + return; + + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; + + /* + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); +} + +/* + * Called from scheduler_tick() to periodically update this CPU's + * active count. + */ +void calc_global_load_tick(struct rq *this_rq) +{ + long delta; + + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + this_rq->calc_load_update += LOAD_FREQ; +} diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c deleted file mode 100644 index 8ecd552fe4f2..000000000000 --- a/kernel/sched/proc.c +++ /dev/null @@ -1,584 +0,0 @@ -/* - * kernel/sched/proc.c - * - * Kernel load calculations, forked from sched/core.c - */ - -#include - -#include "sched.h" - -/* - * Global load-average calculations - * - * We take a distributed and async approach to calculating the global load-avg - * in order to minimize overhead. - * - * The global load average is an exponentially decaying average of nr_running + - * nr_uninterruptible. - * - * Once every LOAD_FREQ: - * - * nr_active = 0; - * for_each_possible_cpu(cpu) - * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; - * - * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) - * - * Due to a number of reasons the above turns in the mess below: - * - * - for_each_possible_cpu() is prohibitively expensive on machines with - * serious number of cpus, therefore we need to take a distributed approach - * to calculating nr_active. - * - * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 - * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } - * - * So assuming nr_active := 0 when we start out -- true per definition, we - * can simply take per-cpu deltas and fold those into a global accumulate - * to obtain the same result. See calc_load_fold_active(). - * - * Furthermore, in order to avoid synchronizing all per-cpu delta folding - * across the machine, we assume 10 ticks is sufficient time for every - * cpu to have completed this task. - * - * This places an upper-bound on the IRQ-off latency of the machine. Then - * again, being late doesn't loose the delta, just wrecks the sample. - * - * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - * this would add another cross-cpu cacheline miss and atomic operation - * to the wakeup path. Instead we increment on whatever cpu the task ran - * when it went into uninterruptible state and decrement on whatever cpu - * did the wakeup. This means that only the sum of nr_uninterruptible over - * all cpus yields the correct result. - * - * This covers the NO_HZ=n code, for extra head-aches, see the comment below. - */ - -/* Variables and functions for calc_load */ -atomic_long_t calc_load_tasks; -unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); /* should be removed */ - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -long calc_load_fold_active(struct rq *this_rq) -{ - long nr_active, delta = 0; - - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; - - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; - } - - return delta; -} - -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * Handle NO_HZ for the global load-average. - * - * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by - * NO_HZ. - * - * The basic idea is to fold the nr_active delta into a global idle-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta - * when we read the global state. - * - * Obviously reality has to ruin such a delightfully simple scheme: - * - * - When we go NO_HZ idle during the window, we can negate our sample - * contribution, causing under-accounting. - * - * We avoid this by keeping two idle-delta counters and flipping them - * when the window starts, thus separating old and new NO_HZ load. - * - * The only trick is the slight shift in index flip for read vs write. - * - * 0s 5s 10s 15s - * +10 +10 +10 +10 - * |-|-----------|-|-----------|-|-----------|-| - * r:0 0 1 1 0 0 1 1 0 - * w:0 1 1 0 0 1 1 0 0 - * - * This ensures we'll fold the old idle contribution in this window while - * accumlating the new one. - * - * - When we wake up from NO_HZ idle during the window, we push up our - * contribution, since we effectively move our sample point to a known - * busy state. - * - * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which - * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_idle[2]; -static int calc_load_idx; - -static inline int calc_load_write_idx(void) -{ - int idx = calc_load_idx; - - /* - * See calc_global_nohz(), if we observe the new index, we also - * need to observe the new update time. - */ - smp_rmb(); - - /* - * If the folding window started, make sure we start writing in the - * next idle-delta. - */ - if (!time_before(jiffies, calc_load_update)) - idx++; - - return idx & 1; -} - -static inline int calc_load_read_idx(void) -{ - return calc_load_idx & 1; -} - -void calc_load_enter_idle(void) -{ - struct rq *this_rq = this_rq(); - long delta; - - /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. - */ - delta = calc_load_fold_active(this_rq); - if (delta) { - int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); - } -} - -void calc_load_exit_idle(void) -{ - struct rq *this_rq = this_rq(); - - /* - * If we're still before the sample window, we're done. - */ - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - /* - * We woke inside or after the sample window, this means we're already - * accounted through the nohz accounting, so skip the entire deal and - * sync up for the next window. - */ - this_rq->calc_load_update = calc_load_update; - if (time_before(jiffies, this_rq->calc_load_update + 10)) - this_rq->calc_load_update += LOAD_FREQ; -} - -static long calc_load_fold_idle(void) -{ - int idx = calc_load_read_idx(); - long delta = 0; - - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); - - return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. - */ -static void calc_global_nohz(void) -{ - long delta, active, n; - - if (!time_before(jiffies, calc_load_update + 10)) { - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; - } - - /* - * Flip the idle index... - * - * Make sure we first write the new time then flip the index, so that - * calc_load_write_idx() will see the new time when it reads the new - * index, this avoids a double flip messing things up. - */ - smp_wmb(); - calc_load_idx++; -} -#else /* !CONFIG_NO_HZ_COMMON */ - -static inline long calc_load_fold_idle(void) { return 0; } -static inline void calc_global_nohz(void) { } - -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ - long active, delta; - - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update += LOAD_FREQ; - - /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. - */ - calc_global_nohz(); -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. - */ -static void calc_load_account_active(struct rq *this_rq) -{ - long delta; - - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_SMP -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->cfs.runnable_load_avg; -} -#else -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->load.weight; -} -#endif - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = get_rq_runnable_load(this_rq); - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ */ - -/* - * Called from scheduler_tick() - */ -void update_cpu_load_active(struct rq *this_rq) -{ - unsigned long load = get_rq_runnable_load(this_rq); - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); - - calc_load_account_active(this_rq); -} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e0e129993958..09ed26a89f31 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); + +#ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); +#else +static inline void update_cpu_load_active(struct rq *this_rq) { } +#endif /* * Helpers for converting nanosecond timing to jiffy resolution @@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void update_idle_cpu_load(struct rq *this_rq); - extern void init_task_runnable_average(struct task_struct *p); static inline void add_nr_running(struct rq *rq, unsigned count) -- cgit v1.2.3-55-g7522