From 05484e0984487d42e97c417cbb0697fa9d16e7e9 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:31 +0100 Subject: sched/topology: Add SD_ASYM_CPUCAPACITY flag detection The SD_ASYM_CPUCAPACITY sched_domain flag is supposed to mark the sched_domain in the hierarchy where all CPU capacities are visible for any CPU's point of view on asymmetric CPU capacity systems. The scheduler can then take to take capacity asymmetry into account when balancing at this level. It also serves as an indicator for how wide task placement heuristics have to search to consider all available CPU capacities as asymmetric systems might often appear symmetric at smallest level(s) of the sched_domain hierarchy. The flag has been around for while but so far only been set by out-of-tree code in Android kernels. One solution is to let each architecture provide the flag through a custom sched_domain topology array and associated mask and flag functions. However, SD_ASYM_CPUCAPACITY is special in the sense that it depends on the capacity and presence of all CPUs in the system, i.e. when hotplugging all CPUs out except those with one particular CPU capacity the flag should disappear even if the sched_domains don't collapse. Similarly, the flag is affected by cpusets where load-balancing is turned off. Detecting when the flags should be set therefore depends not only on topology information but also the cpuset configuration and hotplug state. The arch code doesn't have easy access to the cpuset configuration. Instead, this patch implements the flag detection in generic code where cpusets and hotplug state is already taken care of. All the arch is responsible for is to implement arch_scale_cpu_capacity() and force a full rebuild of the sched_domain hierarchy if capacities are updated, e.g. later in the boot process when cpufreq has initialized. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-2-git-send-email-morten.rasmussen@arm.com [ Fixed 'CPU' capitalization. ] Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 81 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 505a41c42b96..5c4d583d53ee 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1061,7 +1061,6 @@ static struct cpumask ***sched_domains_numa_masks; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1073,13 +1072,12 @@ static struct cpumask ***sched_domains_numa_masks; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -1100,6 +1098,9 @@ sd_init(struct sched_domain_topology_level *tl, "wrong sd_flags in topology description\n")) sd_flags &= ~TOPOLOGY_SD_FLAGS; + /* Apply detected topology flags */ + sd_flags |= dflags; + *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, @@ -1604,9 +1605,9 @@ static void __sdt_free(const struct cpumask *cpu_map) static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu); if (child) { sd->level = child->level + 1; @@ -1632,6 +1633,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve return sd; } +/* + * Find the sched_domain_topology_level where all CPU capacities are visible + * for all CPUs. + */ +static struct sched_domain_topology_level +*asym_cpu_capacity_level(const struct cpumask *cpu_map) +{ + int i, j, asym_level = 0; + bool asym = false; + struct sched_domain_topology_level *tl, *asym_tl = NULL; + unsigned long cap; + + /* Is there any asymmetry? */ + cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + + for_each_cpu(i, cpu_map) { + if (arch_scale_cpu_capacity(NULL, i) != cap) { + asym = true; + break; + } + } + + if (!asym) + return NULL; + + /* + * Examine topology from all CPU's point of views to detect the lowest + * sched_domain_topology_level where a highest capacity CPU is visible + * to everyone. + */ + for_each_cpu(i, cpu_map) { + unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + int tl_id = 0; + + for_each_sd_topology(tl) { + if (tl_id < asym_level) + goto next_level; + + for_each_cpu_and(j, tl->mask(i), cpu_map) { + unsigned long capacity; + + capacity = arch_scale_cpu_capacity(NULL, j); + + if (capacity <= max_capacity) + continue; + + max_capacity = capacity; + asym_level = tl_id; + asym_tl = tl; + } +next_level: + tl_id++; + } + } + + return asym_tl; +} + + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs @@ -1644,18 +1704,27 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; + struct sched_domain_topology_level *tl_asym; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; + tl_asym = asym_cpu_capacity_level(cpu_map); + /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); + int dflags = 0; + + if (tl == tl_asym) + dflags |= SD_ASYM_CPUCAPACITY; + + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP) -- cgit v1.2.3-55-g7522 From df054e8445a4011e3d693c2268129c0456108663 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:39 +0100 Subject: sched/topology: Add static_key for asymmetric CPU capacity optimizations The existing asymmetric CPU capacity code should cause minimal overhead for others. Putting it behind a static_key, it has been done for SMT optimizations, would make it easier to extend and improve without causing harm to others moving forward. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ kernel/sched/sched.h | 1 + kernel/sched/topology.c | 9 ++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f808ddf2a868..3e5071aeb117 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6188,6 +6188,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 0; + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a2e8cae63c4..0f36adc31ba5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1185,6 +1185,7 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { atomic_t ref; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5c4d583d53ee..b0cdf5e95bda 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) { @@ -1705,6 +1706,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; struct sched_domain_topology_level *tl_asym; + bool has_asym = false; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) @@ -1720,8 +1722,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for_each_sd_topology(tl) { int dflags = 0; - if (tl == tl_asym) + if (tl == tl_asym) { dflags |= SD_ASYM_CPUCAPACITY; + has_asym = true; + } sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); @@ -1773,6 +1777,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } rcu_read_unlock(); + if (has_asym) + static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); -- cgit v1.2.3-55-g7522 From 3b1baa6496e6b7ad016342a9d256bdfb072ce902 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:40 +0100 Subject: sched/fair: Add 'group_misfit_task' load-balance type To maximize throughput in systems with asymmetric CPU capacities (e.g. ARM big.LITTLE) load-balancing has to consider task and CPU utilization as well as per-CPU compute capacity when load-balancing in addition to the current average load based load-balancing policy. Tasks with high utilization that are scheduled on a lower capacity CPU need to be identified and migrated to a higher capacity CPU if possible to maximize throughput. To implement this additional policy an additional group_type (load-balance scenario) is added: 'group_misfit_task'. This represents scenarios where a sched_group has one or more tasks that are not suitable for its per-CPU capacity. 'group_misfit_task' is only considered if the system is not overloaded or imbalanced ('group_imbalanced' or 'group_overloaded'). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each CPU is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 54 ++++++++++++++++++++++++++++++++++++++++++++-------- kernel/sched/sched.h | 2 ++ 2 files changed, 48 insertions(+), 8 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e5071aeb117..6e04bea5b11a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); +static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -1446,7 +1447,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -3647,6 +3647,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) WRITE_ONCE(p->se.avg.util_est, ue); } +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ + return capacity * 1024 > task_util_est(p) * capacity_margin; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + rq->misfit_task_load = task_h_load(p); +} + #else /* CONFIG_SMP */ #define UPDATE_TG 0x0 @@ -3676,6 +3699,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -6201,7 +6225,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) /* Bring task utilization in sync with prev_cpu */ sync_entity_load_avg(&p->se); - return min_cap * 1024 < task_util(p) * capacity_margin; + return !task_fits_capacity(p, min_cap); } /* @@ -6618,9 +6642,12 @@ done: __maybe_unused; if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + update_misfit_status(p, rq); + return p; idle: + update_misfit_status(NULL, rq); new_tasks = idle_balance(rq, rf); /* @@ -6826,6 +6853,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -7399,12 +7433,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -7420,6 +7448,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -7712,6 +7741,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task_load) + return group_misfit_task; + return group_other; } @@ -7786,6 +7818,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load < rq->misfit_task_load) + sgs->group_misfit_task_load = rq->misfit_task_load; } /* Adjust by relative CPU capacity of the group */ @@ -9567,6 +9603,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0f36adc31ba5..7dbf67d147a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -842,6 +842,8 @@ struct rq { unsigned char idle_balance; + unsigned long misfit_task_load; + /* For active balancing */ int active_balance; int push_cpu; -- cgit v1.2.3-55-g7522 From e3d6d0cb66f2351cbfd09fbae04eb9804afe9577 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:41 +0100 Subject: sched/fair: Add sched_group per-CPU max capacity The current sg->min_capacity tracks the lowest per-CPU compute capacity available in the sched_group when rt/irq pressure is taken into account. Minimum capacity isn't the ideal metric for tracking if a sched_group needs offloading to another sched_group for some scenarios, e.g. a sched_group with multiple CPUs if only one is under heavy pressure. Tracking maximum capacity isn't perfect either but a better choice for some situations as it indicates that the sched_group definitely compute capacity constrained either due to rt/irq pressure on all CPUs or asymmetric CPU capacities (e.g. big.LITTLE). Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 ++++++++++++++++++++---- kernel/sched/sched.h | 1 + kernel/sched/topology.c | 2 ++ 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e04bea5b11a..fe04315d57b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7557,13 +7557,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, min_capacity; + unsigned long capacity, min_capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7577,6 +7578,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; min_capacity = ULONG_MAX; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -7607,6 +7609,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } min_capacity = min(capacity, min_capacity); + max_capacity = max(capacity, max_capacity); } } else { /* @@ -7620,12 +7623,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; min_capacity = min(sgc->min_capacity, min_capacity); + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = min_capacity; + sdg->sgc->max_capacity = max_capacity; } /* @@ -7721,16 +7726,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* - * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool -group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * capacity_margin < ref->sgc->min_capacity * 1024; } +/* + * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity_orig than sched_group ref. + */ +static inline bool +group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity * capacity_margin < + ref->sgc->max_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7876,7 +7892,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) + group_smaller_min_cpu_capacity(sds->local, sg)) return false; asym_packing: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7dbf67d147a2..fe17e0be2d7b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1197,6 +1197,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b0cdf5e95bda..2536e1b938f9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -693,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg_span = sched_group_span(sg); sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } static int @@ -852,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; return sg; } -- cgit v1.2.3-55-g7522 From cad68e552e7774b68ae6a2c5fedb792936098b72 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:42 +0100 Subject: sched/fair: Consider misfit tasks when load-balancing On asymmetric CPU capacity systems load intensive tasks can end up on CPUs that don't suit their compute demand. In this scenarios 'misfit' tasks should be migrated to CPUs with higher compute capacity to ensure better throughput. group_misfit_task indicates this scenario, but tweaks to the load-balance code are needed to make the migrations happen. Misfit balancing only makes sense between a source group of lower per-CPU capacity and destination group of higher compute capacity. Otherwise, misfit balancing is ignored. group_misfit_task has lowest priority so any imbalance due to overload is dealt with first. The modifications are: 1. Only pick a group containing misfit tasks as the busiest group if the destination group has higher capacity and has spare capacity. 2. When the busiest group is a 'misfit' group, skip the usual average load and group capacity checks. 3. Set the imbalance for 'misfit' balancing sufficiently high for a task to be pulled ignoring average load. 4. Pick the CPU with the highest misfit load as the source CPU. 5. If the misfit task is alone on the source CPU, go for active balancing. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe04315d57b3..24fe39e57bc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6890,6 +6890,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type src_grp_type; struct list_head tasks; }; @@ -7873,6 +7874,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* + * Don't try to pull misfit tasks we can't help. + * We can use max_capacity here as reduction in capacity on some + * CPUs in the group should either be possible to resolve + * internally or be covered by avg_load imbalance (eventually). + */ + if (sgs->group_type == group_misfit_task && + (!group_smaller_max_cpu_capacity(sg, sds->local) || + !group_has_capacity(env, &sds->local_stat))) + return false; + if (sgs->group_type > busiest->group_type) return true; @@ -7895,6 +7907,13 @@ static bool update_sd_pick_busiest(struct lb_env *env, group_smaller_min_cpu_capacity(sds->local, sg)) return false; + /* + * If we have more than one misfit sg go with the biggest misfit. + */ + if (sgs->group_type == group_misfit_task && + sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) @@ -8192,8 +8211,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * factors in sg capacity and sgs with smaller group_type are * skipped when updating the busiest sg: */ - if (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load) { + if (busiest->group_type != group_misfit_task && + (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load)) { env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -8227,6 +8247,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task_load); + } + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -8293,6 +8319,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -8330,6 +8360,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ + env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -8377,6 +8408,19 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we simply + * seek the "biggest" misfit task. + */ + if (env->src_grp_type == group_misfit_task) { + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + continue; + } + capacity = capacity_of(i); wl = weighted_cpuload(rq); @@ -8446,6 +8490,9 @@ static int need_active_balance(struct lb_env *env) return 1; } + if (env->src_grp_type == group_misfit_task) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } -- cgit v1.2.3-55-g7522 From 5fbdfae5221a5208ed8e7653fc1c4b31de420f74 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:43 +0100 Subject: sched/fair: Kick nohz balance if rq->misfit_task_load There already are a few conditions in nohz_kick_needed() to ensure a nohz kick is triggered, but they are not enough for some misfit task scenarios. Excluding asym packing, those are: - rq->nr_running >=2: Not relevant here because we are running a misfit task, it needs to be migrated regardless and potentially through active balance. - sds->nr_busy_cpus > 1: If there is only the misfit task being run on a group of low capacity CPUs, this will be evaluated to False. - rq->cfs.h_nr_running >=1 && check_cpu_capacity(): Not relevant here, misfit task needs to be migrated regardless of rt/IRQ pressure As such, this commit adds an rq->misfit_task_load condition to trigger a nohz kick. The idea to kick a nohz balance for misfit tasks originally came from Leo Yan , and a similar patch was submitted for the Android Common Kernel - see: https://lists.linaro.org/pipermail/eas-dev/2016-September/000551.html Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-6-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 24fe39e57bc3..e08287d3806f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9135,7 +9135,7 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2) { + if (rq->nr_running >= 2 || rq->misfit_task_load) { flags = NOHZ_KICK_MASK; goto out; } -- cgit v1.2.3-55-g7522 From dbbad719449e06d73db21598d6eee178f7a54b3b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:44 +0100 Subject: sched/fair: Change 'prefer_sibling' type to bool This variable is entirely local to update_sd_lb_stats, so we can safely change its type and slightly clean up its initialisation. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e08287d3806f..23017939ecab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7982,11 +7982,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx, prefer_sibling = 0; + int load_idx; bool overload = false; - - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; + bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) -- cgit v1.2.3-55-g7522 From 575638d1047eb057a5cdf95cc0b3c084e1279508 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:45 +0100 Subject: sched/core: Change root_domain->overload type to int sizeof(_Bool) is implementation defined, so let's just go with 'int' as is done for other structures e.g. sched_domain_shared->has_idle_cores. The local 'overload' variable used in update_sd_lb_stats can remain bool, as it won't impact any struct layout and can be assigned to the root_domain field. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-8-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fe17e0be2d7b..4d181478c5b8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -716,7 +716,7 @@ struct root_domain { cpumask_var_t online; /* Indicate more than one runnable task for any CPU */ - bool overload; + int overload; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -1698,7 +1698,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP if (!rq->rd->overload) - rq->rd->overload = true; + rq->rd->overload = 1; #endif } -- cgit v1.2.3-55-g7522 From e90c8fe15a3bf93a23088bcf1a56a0fa391d4e50 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:46 +0100 Subject: sched/fair: Wrap rq->rd->overload accesses with READ/WRITE_ONCE() This variable can be read and set locklessly within update_sd_lb_stats(). As such, READ/WRITE_ONCE() are added to make sure nothing terribly wrong can happen because of the compiler. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-9-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23017939ecab..d9c4e97bfebd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8058,8 +8058,8 @@ next_group: if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (env->dst_rq->rd->overload != overload) - env->dst_rq->rd->overload = overload; + if (READ_ONCE(env->dst_rq->rd->overload) != overload) + WRITE_ONCE(env->dst_rq->rd->overload, overload); } } @@ -9502,7 +9502,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4d181478c5b8..938063639793 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1697,8 +1697,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP - if (!rq->rd->overload) - rq->rd->overload = 1; + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); #endif } -- cgit v1.2.3-55-g7522 From 757ffdd705ee942fc8150b17942d968601d2a15b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:47 +0100 Subject: sched/fair: Set rq->rd->overload when misfit Idle balance is a great opportunity to pull a misfit task. However, there are scenarios where misfit tasks are present but idle balance is prevented by the overload flag. A good example of this is a workload of n identical tasks. Let's suppose we have a 2+2 Arm big.LITTLE system. We then spawn 4 fairly CPU-intensive tasks - for the sake of simplicity let's say they are just CPU hogs, even when running on big CPUs. They are identical tasks, so on an SMP system they should all end at (roughly) the same time. However, in our case the LITTLE CPUs are less performing than the big CPUs, so tasks running on the LITTLEs will have a longer completion time. This means that the big CPUs will complete their work earlier, at which point they should pull the tasks from the LITTLEs. What we want to happen is summarized as follows: a,b,c,d are our CPU-hogging tasks _ signifies idling LITTLE_0 | a a a a _ _ LITTLE_1 | b b b b _ _ ---------|------------- big_0 | c c c c a a big_1 | d d d d b b ^ ^ Tasks end on the big CPUs, idle balance happens and the misfit tasks are pulled straight away This however won't happen, because currently the overload flag is only set when there is any CPU that has more than one runnable task - which may very well not be the case here if our CPU-hogging workload is all there is to run. As such, this commit sets the overload flag in update_sg_lb_stats when a group is flagged as having a misfit task. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-10-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++-- kernel/sched/sched.h | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9c4e97bfebd..8b228c5b3eb4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7793,7 +7793,7 @@ static bool update_nohz_stats(struct rq *rq, bool force) * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate more than one runnable task for any CPU. + * @overload: Indicate pullable load (e.g. >1 runnable task). */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, @@ -7837,8 +7837,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; if (env->sd->flags & SD_ASYM_CPUCAPACITY && - sgs->group_misfit_task_load < rq->misfit_task_load) + sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; + *overload = 1; + } } /* Adjust by relative CPU capacity of the group */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 938063639793..85b3a2bf6c2b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -715,7 +715,11 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; - /* Indicate more than one runnable task for any CPU */ + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ int overload; /* -- cgit v1.2.3-55-g7522 From 4ad3831a9d4af5e36da5d44a3b9c6522d0353cee Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 4 Jul 2018 11:17:48 +0100 Subject: sched/fair: Don't move tasks to lower capacity CPUs unless necessary When lower capacity CPUs are load balancing and considering to pull something from a higher capacity group, we should not pull tasks from a CPU with only one task running as this is guaranteed to impede progress for that task. If there is more than one task running, load balance in the higher capacity group would have already made any possible moves to resolve imbalance and we should make better use of system compute capacity by moving a task if we still have more than one running. Signed-off-by: Chris Redpath Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-11-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8b228c5b3eb4..06ff75f4ac7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8423,6 +8423,17 @@ static struct rq *find_busiest_queue(struct lb_env *env, capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a CPU that could + * eventually lead to active_balancing high->low capacity. + * Higher per-CPU capacity is considered better than balancing + * average load. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(rq); /* -- cgit v1.2.3-55-g7522 From 9c63e84db29bcf584040931ad97c2edd11e35f6c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:50 +0100 Subject: sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains The 'prefer sibling' sched_domain flag is intended to encourage spreading tasks to sibling sched_domain to take advantage of more caches and core for SMT systems. It has recently been changed to be on all non-NUMA topology level. However, spreading across domains with CPU capacity asymmetry isn't desirable, e.g. spreading from high capacity to low capacity CPUs even if high capacity CPUs aren't overutilized might give access to more cache but the CPU will be slower and possibly lead to worse overall throughput. To prevent this, we need to remove SD_PREFER_SIBLING on the sched_domain level immediately below SD_ASYM_CPUCAPACITY. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-13-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2536e1b938f9..7ffad0d3a4eb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1126,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING + | 1*SD_PREFER_SIBLING | 0*SD_NUMA | sd_flags , @@ -1152,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_ASYM_CPUCAPACITY) { struct sched_domain *t = sd; + /* + * Don't attempt to spread across CPUs of different capacities. + */ + if (sd->child) + sd->child->flags &= ~SD_PREFER_SIBLING; + for_each_lower_domain(t) t->flags |= SD_BALANCE_WAKE; } if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 117; sd->cache_nice_tries = 1; sd->busy_idx = 2; @@ -1173,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->busy_idx = 3; sd->idle_idx = 2; + sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { sd->flags &= ~(SD_BALANCE_EXEC | @@ -1182,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { - sd->flags |= SD_PREFER_SIBLING; sd->cache_nice_tries = 1; sd->busy_idx = 2; sd->idle_idx = 1; -- cgit v1.2.3-55-g7522 From 7e6f4c5d600c1c8e2a1d900e65cab319d9b6782e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Aug 2018 11:45:21 +0200 Subject: sched/debug: Explicitly cast sched_feat() to bool LLVM has a warning that tags expressions like: if (foo && non-bool-const) This pattern triggers for CONFIG_SCHED_DEBUG=n where sched_feat() ends up being whatever bit we select. Avoid the warning with an explicit cast to bool. Reported-by: Philipp Klocke Tested-by: Nick Desaulniers Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 85b3a2bf6c2b..3a4ef8f73f08 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1401,7 +1401,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = 0; #undef SCHED_FEAT -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ -- cgit v1.2.3-55-g7522 From d90707ebebe03596e19de3abbf79b766e72a3465 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 29 Aug 2018 15:19:09 +0200 Subject: sched/numa: Remove unused code from update_numa_stats() With: commit 2d4056fafa19 ("sched/numa: Remove numa_has_capacity()") the local variables 'smt', 'cpus' and 'capacity' and their results are not used anymore in numa_has_capacity() Remove this unused code. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1535548752-4434-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 06ff75f4ac7b..b65596fae06b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1463,8 +1463,7 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int smt, cpu, cpus = 0; - unsigned long capacity; + int cpu; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1473,26 +1472,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); - - cpus++; } - /* - * If we raced with hotplug and there are no CPUs left in our mask - * the @ns structure is NULL'ed and task_numa_compare() will - * not find this node attractive. - * - * We'll detect a huge imbalance and bail there. - */ - if (!cpus) - return; - - /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); - capacity = cpus / smt; /* cores */ - - capacity = min_t(unsigned, capacity, - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); } struct task_numa_env { -- cgit v1.2.3-55-g7522 From 7477a3504e619768c9e972dafe2907e6b8ed9823 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 29 Aug 2018 15:19:10 +0200 Subject: sched/numa: Remove unused numa_stats::nr_running field nr_running in struct numa_stats is not used anywhere in the code. Remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1535548752-4434-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b65596fae06b..6bd142d19549 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1454,8 +1454,6 @@ struct numa_stats { /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - - unsigned int nr_running; }; /* @@ -1469,7 +1467,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) for_each_cpu(cpu, cpumask_of_node(nid)) { struct rq *rq = cpu_rq(cpu); - ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); } -- cgit v1.2.3-55-g7522 From ace8031099f91480799b5929b4cccf2dcacc5136 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 3 Aug 2018 20:37:32 +0800 Subject: sched/topology: Make local variables static Fix the following warnings: kernel/sched/topology.c:10:15: warning: symbol 'sched_domains_tmpmask' was not declared. Should it be static? kernel/sched/topology.c:11:15: warning: symbol 'sched_domains_tmpmask2' was not declared. Should it be static? Signed-off-by: zhong jiang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1533299852-26941-1-git-send-email-zhongjiang@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 7ffad0d3a4eb..9d74371e4aad 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -7,8 +7,8 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ -cpumask_var_t sched_domains_tmpmask; -cpumask_var_t sched_domains_tmpmask2; +static cpumask_var_t sched_domains_tmpmask; +static cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG -- cgit v1.2.3-55-g7522 From 11d4afd4ff667f9b6178ee8c142c36cb78bd84db Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 25 Sep 2018 11:17:42 +0200 Subject: sched/pelt: Fix warning and clean up IRQ PELT config Create a config for enabling irq load tracking in the scheduler. irq load tracking is useful only when irq or paravirtual time is accounted but it's only possible with SMP for now. Also use __maybe_unused to remove the compilation warning in update_rq_clock_task() that has been introduced by: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Suggested-by: Ingo Molnar Reported-by: Dou Liyang Reported-by: Miguel Ojeda Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@alien8.de Cc: dou_liyang@163.com Fixes: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Link: http://lkml.kernel.org/r/1537867062-27285-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- init/Kconfig | 5 +++++ kernel/sched/core.c | 7 +++---- kernel/sched/fair.c | 2 +- kernel/sched/pelt.c | 2 +- kernel/sched/pelt.h | 2 +- kernel/sched/sched.h | 5 ++--- 6 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel/sched') diff --git a/init/Kconfig b/init/Kconfig index 1e234e2f1cba..317d5ccb5191 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -415,6 +415,11 @@ config IRQ_TIME_ACCOUNTING If in doubt, say N here. +config HAVE_SCHED_AVG_IRQ + def_bool y + depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING + depends on SMP + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad97f3ba5ec5..f2caf1bae4a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -135,9 +135,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) * In theory, the compile should just see 0 here, and optimize out the call * to sched_rt_avg_update. But I don't trust it... */ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif + s64 __maybe_unused steal = 0, irq_delta = 0; + #ifdef CONFIG_IRQ_TIME_ACCOUNTING irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; @@ -177,7 +176,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d92ed2eca8b..d59307ecd67d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7317,7 +7317,7 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; #endif diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 35475c0c5419..48a126486435 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -358,7 +358,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: * diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index d2894db28955..7e56b489ff32 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -6,7 +6,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else static inline int diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 632804fa0b12..798b1afd5092 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -862,8 +862,7 @@ struct rq { struct sched_avg avg_rt; struct sched_avg avg_dl; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) -#define HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif u64 idle_stamp; @@ -2223,7 +2222,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq) } #endif -#ifdef HAVE_SCHED_AVG_IRQ +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { return rq->avg_irq.util_avg; -- cgit v1.2.3-55-g7522 From fdf5f315d5cfaefb7bb8a62ec4bf37b9891837aa Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 9 Aug 2018 14:57:53 +0100 Subject: sched/fair: Disable LB_BIAS by default LB_BIAS allows the adjustment on how conservative load should be balanced. The rq->cpu_load[idx] array is used for this functionality. It contains weighted CPU load decayed average values over different intervals (idx = 1..4). Idx = 0 is the weighted CPU load itself. The values are updated during scheduler_tick, before idle balance and at nohz exit. There are 5 different types of idx's per sched domain (sd). Each of them is used to index into the rq->cpu_load[idx] array in a specific scenario (busy, idle and newidle for load balancing, forkexec for wake-up slow-path load balancing and wake for affine wakeup based on weight). Only the sd idx's for busy and idle load balancing are set to 2,3 or 1,2 respectively. All the other sd idx's are set to 0. Conservative load balancing is achieved for sd idx's >= 1 by using the min/max (source_load()/target_load()) value between the current weighted CPU load and the rq->cpu_load[sd idx -1] for the busiest(idlest)/local CPU load in load balancing or vice versa in the wake-up slow-path load balancing. There is no conservative balancing for sd idx = 0 since only current weighted CPU load is used in this case. It is very likely that LB_BIAS' influence on load balancing can be neglected (see test results below). This is further supported by: (1) Weighted CPU load today is by itself a decayed average value (PELT) (cfs_rq->avg->runnable_load_avg) and not the instantaneous load (rq->load.weight) it was when LB_BIAS was introduced. (2) Sd imbalance_pct is used for CPU_NEWLY_IDLE and CPU_NOT_IDLE (relate to sd's newidle and busy idx) in find_busiest_group() when comparing busiest and local avg load to make load balancing even more conservative. (3) The sd forkexec and newidle idx are always set to 0 so there is no adjustment on how conservatively load balancing is done here. (4) Affine wakeup based on weight (wake_affine_weight()) will not be impacted since the sd wake idx is always set to 0. Let's disable LB_BIAS by default for a few kernel releases to make sure that no workload and no scheduler topology is affected. The benefit of being able to remove the LB_BIAS dependency from source_load() and target_load() is that the entire rq->cpu_load[idx] code could be removed in this case. It is really hard to say if there is no regression w/o testing this with a lot of different workloads on a lot of different platforms, especially NUMA machines. The following 104 LKP (Linux Kernel Performance) tests were run by the 0-Day guys mostly on multi-socket hosts with a larger number of logical cpus (88, 192). The base for the test was commit b3dae109fa89 ("sched/swait: Rename to exclusive") (tip/sched/core v4.18-rc1). Only 2 out of the 104 tests had a significant change in one of the metrics (fsmark/1x-1t-1HDD-btrfs-nfsv4-4M-60G-NoSync-performance +7% files_per_sec, unixbench/300s-100%-syscall-performance -11% score). Tests which showed a change in one of the metrics are marked with a '*' and this change is listed as well. (a) lkp-bdw-ep3: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz 64G dd-write/10m-1HDD-cfq-btrfs-100dd-performance fsmark/1x-1t-1HDD-xfs-nfsv4-4M-60G-NoSync-performance * fsmark/1x-1t-1HDD-btrfs-nfsv4-4M-60G-NoSync-performance 7.50 7% 8.00 ± 6% fsmark.files_per_sec fsmark/1x-1t-1HDD-btrfs-nfsv4-4M-60G-fsyncBeforeClose-performance fsmark/1x-1t-1HDD-btrfs-4M-60G-NoSync-performance fsmark/1x-1t-1HDD-btrfs-4M-60G-fsyncBeforeClose-performance kbuild/300s-50%-vmlinux_prereq-performance kbuild/300s-200%-vmlinux_prereq-performance kbuild/300s-50%-vmlinux_prereq-performance-1HDD-ext4 kbuild/300s-200%-vmlinux_prereq-performance-1HDD-ext4 (b) lkp-skl-4sp1: 192 threads Intel(R) Xeon(R) Platinum 8160 768G dbench/100%-performance ebizzy/200%-100x-10s-performance hackbench/1600%-process-pipe-performance iperf/300s-cs-localhost-tcp-performance iperf/300s-cs-localhost-udp-performance perf-bench-numa-mem/2t-300M-performance perf-bench-sched-pipe/10000000ops-process-performance perf-bench-sched-pipe/10000000ops-threads-performance schbench/2-16-300-30000-30000-performance tbench/100%-cs-localhost-performance (c) lkp-bdw-ep6: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz 128G stress-ng/100%-60s-pipe-performance unixbench/300s-1-whetstone-double-performance unixbench/300s-1-shell1-performance unixbench/300s-1-shell8-performance unixbench/300s-1-pipe-performance * unixbench/300s-1-context1-performance 312 315 unixbench.score unixbench/300s-1-spawn-performance unixbench/300s-1-syscall-performance unixbench/300s-1-dhry2reg-performance unixbench/300s-1-fstime-performance unixbench/300s-1-fsbuffer-performance unixbench/300s-1-fsdisk-performance unixbench/300s-100%-whetstone-double-performance unixbench/300s-100%-shell1-performance unixbench/300s-100%-shell8-performance unixbench/300s-100%-pipe-performance unixbench/300s-100%-context1-performance unixbench/300s-100%-spawn-performance * unixbench/300s-100%-syscall-performance 3571 ± 3% -11% 3183 ± 4% unixbench.score unixbench/300s-100%-dhry2reg-performance unixbench/300s-100%-fstime-performance unixbench/300s-100%-fsbuffer-performance unixbench/300s-100%-fsdisk-performance unixbench/300s-1-execl-performance unixbench/300s-100%-execl-performance * will-it-scale/brk1-performance 365004 360387 will-it-scale.per_thread_ops * will-it-scale/dup1-performance 432401 437596 will-it-scale.per_thread_ops will-it-scale/eventfd1-performance will-it-scale/futex1-performance will-it-scale/futex2-performance will-it-scale/futex3-performance will-it-scale/futex4-performance will-it-scale/getppid1-performance will-it-scale/lock1-performance will-it-scale/lseek1-performance will-it-scale/lseek2-performance * will-it-scale/malloc1-performance 47025 45817 will-it-scale.per_thread_ops 77499 76529 will-it-scale.per_process_ops will-it-scale/malloc2-performance * will-it-scale/mmap1-performance 123399 120815 will-it-scale.per_thread_ops 152219 149833 will-it-scale.per_process_ops * will-it-scale/mmap2-performance 107327 104714 will-it-scale.per_thread_ops 136405 133765 will-it-scale.per_process_ops will-it-scale/open1-performance * will-it-scale/open2-performance 171570 168805 will-it-scale.per_thread_ops 532644 526202 will-it-scale.per_process_ops will-it-scale/page_fault1-performance will-it-scale/page_fault2-performance will-it-scale/page_fault3-performance will-it-scale/pipe1-performance will-it-scale/poll1-performance * will-it-scale/poll2-performance 176134 172848 will-it-scale.per_thread_ops 281361 275053 will-it-scale.per_process_ops will-it-scale/posix_semaphore1-performance will-it-scale/pread1-performance will-it-scale/pread2-performance will-it-scale/pread3-performance will-it-scale/pthread_mutex1-performance will-it-scale/pthread_mutex2-performance will-it-scale/pwrite1-performance will-it-scale/pwrite2-performance will-it-scale/pwrite3-performance * will-it-scale/read1-performance 1190563 1174833 will-it-scale.per_thread_ops * will-it-scale/read2-performance 1105369 1080427 will-it-scale.per_thread_ops will-it-scale/readseek1-performance * will-it-scale/readseek2-performance 261818 259040 will-it-scale.per_thread_ops will-it-scale/readseek3-performance * will-it-scale/sched_yield-performance 2408059 2382034 will-it-scale.per_thread_ops will-it-scale/signal1-performance will-it-scale/unix1-performance will-it-scale/unlink1-performance will-it-scale/unlink2-performance * will-it-scale/write1-performance 976701 961588 will-it-scale.per_thread_ops * will-it-scale/writeseek1-performance 831898 822448 will-it-scale.per_thread_ops * will-it-scale/writeseek2-performance 228248 225065 will-it-scale.per_thread_ops * will-it-scale/writeseek3-performance 226670 224058 will-it-scale.per_thread_ops will-it-scale/context_switch1-performance aim7/performance-fork_test-2000 * aim7/performance-brk_test-3000 74869 76676 aim7.jobs-per-min aim7/performance-disk_cp-3000 aim7/performance-disk_rd-3000 aim7/performance-sieve-3000 aim7/performance-page_test-3000 aim7/performance-creat-clo-3000 aim7/performance-mem_rtns_1-8000 aim7/performance-disk_wrt-8000 aim7/performance-pipe_cpy-8000 aim7/performance-ram_copy-8000 (d) lkp-avoton3: 8 threads Intel(R) Atom(TM) CPU C2750 @ 2.40GHz 16G netperf/ipv4-900s-200%-cs-localhost-TCP_STREAM-performance Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Fengguang Wu Cc: Li Zhijian Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180809135753.21077-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 85ae8488039c..858589b83377 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,7 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) -SCHED_FEAT(LB_BIAS, true) +SCHED_FEAT(LB_BIAS, false) /* * Decrement CPU capacity based on time not spent running tasks -- cgit v1.2.3-55-g7522 From 4a465e3ebbc8004ce4f7f08f6022ee8315a94edf Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 3 Aug 2018 15:05:38 +0100 Subject: sched/fair: Remove setting task's se->runnable_weight during PELT update A CFS (SCHED_OTHER, SCHED_BATCH or SCHED_IDLE policy) task's se->runnable_weight must always be in sync with its se->load.weight. se->runnable_weight is set to se->load.weight when the task is forked (init_entity_runnable_average()) or reniced (reweight_entity()). There are two cases in set_load_weight() which since they currently only set se->load.weight could lead to a situation in which se->load.weight is different to se->runnable_weight for a CFS task: (1) A task switches to SCHED_IDLE. (2) A SCHED_FIFO, SCHED_RR or SCHED_DEADLINE task which has been reniced (during which only its static priority gets set) switches to SCHED_OTHER or SCHED_BATCH. Set se->runnable_weight to se->load.weight in these two cases to prevent this. This eliminates the need to explicitly set it to se->load.weight during PELT updates in the CFS scheduler fastpath. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Joel Fernandes Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20180803140538.1178-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ kernel/sched/pelt.c | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f2caf1bae4a3..56b3c1781276 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -700,6 +700,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; + p->se.runnable_weight = load->weight; return; } @@ -712,6 +713,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; + p->se.runnable_weight = load->weight; } } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 48a126486435..90fb5bc12ad4 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -269,9 +269,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; @@ -282,9 +279,6 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, cfs_rq->curr == se)) { -- cgit v1.2.3-55-g7522 From 9c2298aad355d8c1957df3015448fef333526934 Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Thu, 4 Oct 2018 11:05:14 +0200 Subject: sched/core: Fix comment regarding nr_iowait_cpu() and get_iowait_load() The comment related to nr_iowait_cpu() and get_iowait_load() confuses cpufreq with cpuidle and is not very useful for this reason, so fix it. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Linux PM Cc: Tejun Heo Cc: Thomas Gleixner Fixes: e33a9bba85a8 "sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler" Link: http://lkml.kernel.org/r/3803514.xkx7zY50tF@aspire.rjw.lan Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 56b3c1781276..fe0223121883 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2916,10 +2916,10 @@ unsigned long nr_iowait(void) } /* - * Consumers of these two interfaces, like for example the cpufreq menu - * governor are using nonsensical data. Boosting frequency for a CPU that has - * IO-wait which might not even end up running the task when it does become - * runnable. + * Consumers of these two interfaces, like for example the cpuidle menu + * governor, are using nonsensical data. Preferring shallow idle state selection + * for a CPU that has IO-wait which might not even end up running the task when + * it does become runnable. */ unsigned long nr_iowait_cpu(int cpu) -- cgit v1.2.3-55-g7522