summaryrefslogtreecommitdiffstats
path: root/kernel/sched_fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--kernel/sched_fair.c651
1 files changed, 459 insertions, 192 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..f4f6a8326dd0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
/*
* Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 2000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
-static unsigned int sched_nr_latency = 3;
+static unsigned int sched_nr_latency = 8;
/*
* After fork, child runs first. If set to 0 (default) then
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock;
+ u64 now = rq_of(cfs_rq)->clock_task;
unsigned long delta_exec;
if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We are starting a new run period:
*/
- se->exec_start = rq_of(cfs_rq)->clock;
+ se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
@@ -1313,7 +1313,7 @@ static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int load_idx)
{
- struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+ struct sched_group *idlest = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (local_group) {
this_load = avg_load;
- this = group;
} else if (avg_load < min_load) {
min_load = avg_load;
idlest = group;
@@ -1765,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
check_preempt_curr(this_rq, p, 0);
+
+ /* re-arm NEWIDLE balancing when moving tasks */
+ src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+ this_rq->idle_stamp = 0;
}
/*
@@ -1799,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq->clock, sd);
+ tsk_cache_hot = task_hot(p, rq->clock_task, sd);
if (!tsk_cache_hot ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
@@ -2031,12 +2034,14 @@ struct sd_lb_stats {
unsigned long this_load;
unsigned long this_load_per_task;
unsigned long this_nr_running;
+ unsigned long this_has_capacity;
/* Statistics of the busiest group */
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
unsigned long busiest_group_capacity;
+ unsigned long busiest_has_capacity;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2059,6 +2064,7 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
int group_imb; /* Is there an imbalance in the group ? */
+ int group_has_capacity; /* Is there extra capacity in the group? */
};
/**
@@ -2268,10 +2274,14 @@ unsigned long scale_rt_power(int cpu)
struct rq *rq = cpu_rq(cpu);
u64 total, available;
- sched_avg_update(rq);
-
total = sched_avg_period() + (rq->clock - rq->age_stamp);
- available = total - rq->rt_avg;
+
+ if (unlikely(total < rq->rt_avg)) {
+ /* Ensures that power won't end up being negative */
+ available = 0;
+ } else {
+ available = total - rq->rt_avg;
+ }
if (unlikely((s64)total < SCHED_LOAD_SCALE))
total = SCHED_LOAD_SCALE;
@@ -2287,13 +2297,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
unsigned long power = SCHED_LOAD_SCALE;
struct sched_group *sdg = sd->groups;
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_freq_power(sd, cpu);
- else
- power *= default_scale_freq_power(sd, cpu);
-
- power >>= SCHED_LOAD_SHIFT;
-
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
if (sched_feat(ARCH_POWER))
power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2306,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
power >>= SCHED_LOAD_SHIFT;
}
+ sdg->cpu_power_orig = power;
+
+ if (sched_feat(ARCH_POWER))
+ power *= arch_scale_freq_power(sd, cpu);
+ else
+ power *= default_scale_freq_power(sd, cpu);
+
+ power >>= SCHED_LOAD_SHIFT;
+
power *= scale_rt_power(cpu);
power >>= SCHED_LOAD_SHIFT;
@@ -2335,6 +2347,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
sdg->cpu_power = power;
}
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+ /*
+ * Only siblings can have significantly less than SCHED_LOAD_SCALE
+ */
+ if (sd->level != SD_LV_SIBLING)
+ return 0;
+
+ /*
+ * If ~90% of the cpu_power is still there, we're good.
+ */
+ if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+ return 1;
+
+ return 0;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @sd: The sched_domain whose statistics are to be updated.
@@ -2354,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
- unsigned long load, max_cpu_load, min_cpu_load;
+ unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
@@ -2365,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
min_cpu_load = ~0UL;
+ max_nr_running = 0;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
@@ -2382,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
- if (load > max_cpu_load)
+ if (load > max_cpu_load) {
max_cpu_load = load;
+ max_nr_running = rq->nr_running;
+ }
if (min_cpu_load > load)
min_cpu_load = load;
}
@@ -2400,14 +2440,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* domains. In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
- if (idle != CPU_NEWLY_IDLE && local_group &&
- balance_cpu != this_cpu) {
- *balance = 0;
- return;
+ if (idle != CPU_NEWLY_IDLE && local_group) {
+ if (balance_cpu != this_cpu) {
+ *balance = 0;
+ return;
+ }
+ update_group_power(sd, this_cpu);
}
- update_group_power(sd, this_cpu);
-
/* Adjust by relative CPU power of the group */
sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
@@ -2423,11 +2463,58 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity =
- DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ if (!sgs->group_capacity)
+ sgs->group_capacity = fix_small_capacity(sd, group);
+
+ if (sgs->group_capacity > sgs->sum_nr_running)
+ sgs->group_has_capacity = 1;
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs,
+ int this_cpu)
+{
+ if (sgs->avg_load <= sds->max_load)
+ return false;
+
+ if (sgs->sum_nr_running > sgs->group_capacity)
+ return true;
+
+ if (sgs->group_imb)
+ return true;
+
+ /*
+ * ASYM_PACKING needs to move all the work to the lowest
+ * numbered CPUs in the group, therefore mark all groups
+ * higher than ourself as busy.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+ this_cpu < group_first_cpu(sg)) {
+ if (!sds->busiest)
+ return true;
+
+ if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+ return true;
+ }
+
+ return false;
}
/**
@@ -2435,7 +2522,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2533,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
struct sd_lb_stats *sds)
{
struct sched_domain *child = sd->child;
- struct sched_group *group = sd->groups;
+ struct sched_group *sg = sd->groups;
struct sg_lb_stats sgs;
int load_idx, prefer_sibling = 0;
@@ -2459,45 +2546,100 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
do {
int local_group;
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+ update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
local_group, cpus, balance, &sgs);
if (local_group && !(*balance))
return;
sds->total_load += sgs.group_load;
- sds->total_pwr += group->cpu_power;
+ sds->total_pwr += sg->cpu_power;
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the group capacity to one so that we'll try
- * and move all the excess tasks away.
+ * first, lower the sg capacity to one so that we'll try
+ * and move all the excess tasks away. We lower the capacity
+ * of a group only if the local group has the capacity to fit
+ * these excess tasks, i.e. nr_running < group_capacity. The
+ * extra check prevents the case where you always pull from the
+ * heaviest group when it is already under-utilized (possible
+ * with a large weight task outweighs the tasks on the system).
*/
- if (prefer_sibling)
+ if (prefer_sibling && !local_group && sds->this_has_capacity)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
sds->this_load = sgs.avg_load;
- sds->this = group;
+ sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
- } else if (sgs.avg_load > sds->max_load &&
- (sgs.sum_nr_running > sgs.group_capacity ||
- sgs.group_imb)) {
+ sds->this_has_capacity = sgs.group_has_capacity;
+ } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
- sds->busiest = group;
+ sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->busiest_has_capacity = sgs.group_has_capacity;
sds->group_imb = sgs.group_imb;
}
- update_sd_power_savings_stats(group, sds, local_group, &sgs);
- group = group->next;
- } while (group != sd->groups);
+ update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+ sg = sg->next;
+ } while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+ return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ * sched doman.
+ *
+ * This is primarily intended to used at the sibling level. Some
+ * cores like POWER7 prefer to use lower numbered SMT threads. In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle. When in lower SMT modes, the threads will
+ * perform better since they share less core resources. Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads. It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on. Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU. The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalanced due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+ struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ int busiest_cpu;
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ return 0;
+
+ if (!sds->busiest)
+ return 0;
+
+ busiest_cpu = group_first_cpu(sds->busiest);
+ if (this_cpu > busiest_cpu)
+ return 0;
+
+ *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+ SCHED_LOAD_SCALE);
+ return 1;
}
/**
@@ -2637,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
return fix_small_imbalance(sds, this_cpu, imbalance);
}
+
/******* find_busiest_group() helpers end here *********************/
/**
@@ -2688,13 +2831,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* 4) This group is more busy than the avg busieness at this
* sched_domain.
* 5) The imbalance is within the specified limit.
+ *
+ * Note: when doing newidle balance, if the local group has excess
+ * capacity (i.e. nr_running < group_capacity) and the busiest group
+ * does not have any capacity, we force a load balance to pull tasks
+ * to the local group. In this case, we skip past checks 3, 4 and 5.
*/
if (!(*balance))
goto ret;
+ if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+ check_asym_packing(sd, &sds, this_cpu, imbalance))
+ return sds.busiest;
+
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
+ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+ !sds.busiest_has_capacity)
+ goto force_balance;
+
if (sds.this_load >= sds.max_load)
goto out_balanced;
@@ -2706,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
goto out_balanced;
+force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
@@ -2726,8 +2884,9 @@ ret:
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
- unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+ enum cpu_idle_type idle, unsigned long imbalance,
+ const struct cpumask *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
@@ -2738,6 +2897,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
unsigned long wl;
+ if (!capacity)
+ capacity = fix_small_capacity(sd, group);
+
if (!cpumask_test_cpu(i, cpus))
continue;
@@ -2777,9 +2939,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+ int busiest_cpu, int this_cpu)
{
if (idle == CPU_NEWLY_IDLE) {
+
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * higher numbered CPUs in order to pack all tasks in the
+ * lowest numbered CPUs.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+ return 1;
+
/*
* The only task running in a non-idle cpu can be moved to this
* cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +3026,7 @@ redo:
goto out_balanced;
}
- busiest = find_busiest_queue(group, idle, imbalance, cpus);
+ busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -2896,9 +3068,17 @@ redo:
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
- sd->nr_balance_failed++;
+ /*
+ * Increment the failure counter only on periodic balance.
+ * We do not want newidle balance, which can be very
+ * frequent, pollute the failure counter causing
+ * excessive cache_hot migrations and active balances.
+ */
+ if (idle != CPU_NEWLY_IDLE)
+ sd->nr_balance_failed++;
- if (need_active_balance(sd, sd_idle, idle)) {
+ if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+ this_cpu)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -3017,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
- if (pulled_task) {
- this_rq->idle_stamp = 0;
+ if (pulled_task)
break;
- }
}
raw_spin_lock(&this_rq->lock);
@@ -3093,13 +3271,40 @@ out_unlock:
}
#ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+ csd->func = trigger_sched_softirq;
+ csd->info = NULL;
+ csd->flags = 0;
+ csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ * entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ * it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
+ */
static struct {
atomic_t load_balancer;
- cpumask_var_t cpu_mask;
- cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
- .load_balancer = ATOMIC_INIT(-1),
-};
+ atomic_t first_pick_cpu;
+ atomic_t second_pick_cpu;
+ cpumask_var_t idle_cpus_mask;
+ cpumask_var_t grp_idle_mask;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
int get_nohz_load_balancer(void)
{
@@ -3153,17 +3358,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
*/
static inline int is_semi_idle_group(struct sched_group *ilb_group)
{
- cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
sched_group_cpus(ilb_group));
/*
* A sched_group is semi-idle when it has atleast one busy cpu
* and atleast one idle cpu.
*/
- if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ if (cpumask_empty(nohz.grp_idle_mask))
return 0;
- if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
return 0;
return 1;
@@ -3196,7 +3401,7 @@ static int find_new_ilb(int cpu)
* Optimize for the case when we have no idle CPUs or only one
* idle CPU. Don't walk the sched_domain hierarchy in such cases
*/
- if (cpumask_weight(nohz.cpu_mask) < 2)
+ if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done;
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3409,7 @@ static int find_new_ilb(int cpu)
do {
if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.ilb_grp_nohz_mask);
+ return cpumask_first(nohz.grp_idle_mask);
ilb_group = ilb_group->next;
@@ -3212,98 +3417,116 @@ static int find_new_ilb(int cpu)
}
out_done:
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
{
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#endif
/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+ int ilb_cpu;
+
+ nohz.next_balance++;
+
+ ilb_cpu = get_nohz_load_balancer();
+
+ if (ilb_cpu >= nr_cpu_ids) {
+ ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
+ }
+
+ if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+ struct call_single_data *cp;
+
+ cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+ cp = &per_cpu(remote_sched_softirq_cb, cpu);
+ __smp_call_function_single(ilb_cpu, cp, 0);
+ }
+ return;
+}
+
+/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
*
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
*
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
*/
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();
if (stop_tick) {
- cpu_rq(cpu)->in_nohz_recently = 1;
-
if (!cpu_active(cpu)) {
if (atomic_read(&nohz.load_balancer) != cpu)
- return 0;
+ return;
/*
* If we are going offline and still the leader,
* give up!
*/
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
- return 0;
+ return;
}
- cpumask_set_cpu(cpu, nohz.cpu_mask);
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- /* time for ilb owner also to sleep */
- if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
- if (atomic_read(&nohz.load_balancer) == cpu)
- atomic_set(&nohz.load_balancer, -1);
- return 0;
- }
+ if (atomic_read(&nohz.first_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+ if (atomic_read(&nohz.second_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
- if (atomic_read(&nohz.load_balancer) == -1) {
- /* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
- return 1;
- } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
int new_ilb;
- if (!(sched_smt_power_savings ||
- sched_mc_power_savings))
- return 1;
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+ cpu) != nr_cpu_ids)
+ return;
+
/*
* Check to see if there is a more power-efficient
* ilb.
*/
new_ilb = find_new_ilb(cpu);
if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, -1);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
resched_cpu(new_ilb);
- return 0;
+ return;
}
- return 1;
+ return;
}
} else {
- if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
- return 0;
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return;
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
}
- return 0;
+ return;
}
#endif
@@ -3385,11 +3608,102 @@ out:
rq->next_balance = next_balance;
}
+#ifdef CONFIG_NO_HZ
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+ struct rq *this_rq = cpu_rq(this_cpu);
+ struct rq *rq;
+ int balance_cpu;
+
+ if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+ return;
+
+ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ if (balance_cpu == this_cpu)
+ continue;
+
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched()) {
+ this_rq->nohz_balance_kick = 0;
+ break;
+ }
+
+ raw_spin_lock_irq(&this_rq->lock);
+ update_rq_clock(this_rq);
+ update_cpu_load(this_rq);
+ raw_spin_unlock_irq(&this_rq->lock);
+
+ rebalance_domains(balance_cpu, CPU_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(this_rq->next_balance, rq->next_balance))
+ this_rq->next_balance = rq->next_balance;
+ }
+ nohz.next_balance = this_rq->next_balance;
+ this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ * idle load balancer when it has more than one process active. This
+ * eliminates the need for idle load balancing altogether when we have
+ * only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ * to run for active_load_balance to happen (i.e., two busy CPUs are
+ * SMT or core siblings and can run better if they move to different
+ * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ * which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+ unsigned long now = jiffies;
+ int ret;
+ int first_pick_cpu, second_pick_cpu;
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (rq->idle_at_tick)
+ return 0;
+
+ first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+ second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+ if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+ second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+ return 0;
+
+ ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ if (rq->nr_running > 1)
+ return 1;
+ } else {
+ ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ if (rq->nr_running)
+ return 1;
+ }
+ }
+ return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
@@ -3399,37 +3713,12 @@ static void run_rebalance_domains(struct softirq_action *h)
rebalance_domains(this_cpu, idle);
-#ifdef CONFIG_NO_HZ
/*
- * If this cpu is the owner for idle load balancing, then do the
+ * If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
- if (this_rq->idle_at_tick &&
- atomic_read(&nohz.load_balancer) == this_cpu) {
- struct rq *rq;
- int balance_cpu;
-
- for_each_cpu(balance_cpu, nohz.cpu_mask) {
- if (balance_cpu == this_cpu)
- continue;
-
- /*
- * If this cpu gets work to do, stop the load balancing
- * work being done for other cpus. Next load
- * balancing owner will pick it up.
- */
- if (need_resched())
- break;
-
- rebalance_domains(balance_cpu, CPU_IDLE);
-
- rq = cpu_rq(balance_cpu);
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
- }
- }
-#endif
+ nohz_idle_balance(this_cpu, idle);
}
static inline int on_null_domain(int cpu)
@@ -3439,57 +3728,17 @@ static inline int on_null_domain(int cpu)
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
*/
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
-#ifdef CONFIG_NO_HZ
- /*
- * If we were in the nohz mode recently and busy at the current
- * scheduler tick, then check if we need to nominate new idle
- * load balancer.
- */
- if (rq->in_nohz_recently && !rq->idle_at_tick) {
- rq->in_nohz_recently = 0;
-
- if (atomic_read(&nohz.load_balancer) == cpu) {
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
- atomic_set(&nohz.load_balancer, -1);
- }
-
- if (atomic_read(&nohz.load_balancer) == -1) {
- int ilb = find_new_ilb(cpu);
-
- if (ilb < nr_cpu_ids)
- resched_cpu(ilb);
- }
- }
-
- /*
- * If this cpu is idle and doing idle load balancing for all the
- * cpus with ticks stopped, is it time for that to stop?
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
- resched_cpu(cpu);
- return;
- }
-
- /*
- * If this cpu is idle and the idle load balancing is done by
- * someone else, then no need raise the SCHED_SOFTIRQ
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- cpumask_test_cpu(cpu, nohz.cpu_mask))
- return;
-#endif
/* Don't need to rebalance while attached to NULL domain */
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+ else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ nohz_balancer_kick(cpu);
+#endif
}
static void rq_online_fair(struct rq *rq)
@@ -3542,8 +3791,13 @@ static void task_fork_fair(struct task_struct *p)
raw_spin_lock_irqsave(&rq->lock, flags);
- if (unlikely(task_cpu(p) != this_cpu))
+ update_rq_clock(rq);
+
+ if (unlikely(task_cpu(p) != this_cpu)) {
+ rcu_read_lock();
__set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
+ }
update_curr(cfs_rq);
@@ -3615,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
- update_curr(cfs_rq);
+ /*
+ * If the task was not on the rq at the time of this cgroup movement
+ * it must have been asleep, sleeping tasks keep their ->vruntime
+ * absolute on their old rq until wakeup (needed for the fair sleeper
+ * bonus in place_entity()).
+ *
+ * If it was on the rq, we've just 'preempted' it, which does convert
+ * ->vruntime to a relative base.
+ *
+ * Make sure both cases convert their relative position when migrating
+ * to another cgroup's rq. This does somewhat interfere with the
+ * fair sleeper stuff for the first placement, but who cares.
+ */
+ if (!on_rq)
+ p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ set_task_rq(p, task_cpu(p));
if (!on_rq)
- place_entity(cfs_rq, &p->se, 1);
+ p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
#endif
@@ -3673,7 +3940,7 @@ static const struct sched_class fair_sched_class = {
.get_rr_interval = get_rr_interval_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .moved_group = moved_group_fair,
+ .task_move_group = task_move_group_fair,
#endif
};