diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile | 1 | ||||
-rw-r--r-- | kernel/sched/autogroup.c | 3 | ||||
-rw-r--r-- | kernel/sched/completion.c | 30 | ||||
-rw-r--r-- | kernel/sched/core.c | 64 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 27 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 98 | ||||
-rw-r--r-- | kernel/sched/cpupri.c | 2 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 85 | ||||
-rw-r--r-- | kernel/sched/debug.c | 85 | ||||
-rw-r--r-- | kernel/sched/fair.c | 502 | ||||
-rw-r--r-- | kernel/sched/idle.c | 8 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 152 | ||||
-rw-r--r-- | kernel/sched/rt.c | 2 | ||||
-rw-r--r-- | kernel/sched/sched.h | 25 | ||||
-rw-r--r-- | kernel/sched/swait.c | 6 | ||||
-rw-r--r-- | kernel/sched/topology.c | 41 |
16 files changed, 759 insertions, 372 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 53f0164ed362..78f54932ea1d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index da39489d2d80..de6d7f4dfcb5 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void) goto out_fail; tg = sched_create_group(&root_task_group); - if (IS_ERR(tg)) goto out_free; @@ -101,7 +100,7 @@ out_free: out_fail: if (printk_ratelimit()) { printk(KERN_WARNING "autogroup_create: %s failure.\n", - ag ? "sched_create_group()" : "kmalloc()"); + ag ? "sched_create_group()" : "kzalloc()"); } return autogroup_kref_get(&autogroup_default); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..cc873075c3bd 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -32,6 +32,12 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); + + /* + * Perform commit of crossrelease here. + */ + complete_release_commit(x); + if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); @@ -47,6 +53,13 @@ EXPORT_SYMBOL(complete); * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. + * + * Since complete_all() sets the completion of @x permanently to done + * to allow multiple waiters to finish, a call to reinit_completion() + * must be used on @x if @x is to be used again. The code must make + * sure that all waiters have woken and finished before reinitializing + * @x. Also note that the function completion_done() can not be used + * to know if there are still waiters after complete_all() has been called. */ void complete_all(struct completion *x) { @@ -92,9 +105,14 @@ __wait_for_common(struct completion *x, { might_sleep(); + complete_acquire(x); + spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); + + complete_release(x); + return timeout; } @@ -297,9 +315,12 @@ EXPORT_SYMBOL(try_wait_for_completion); * Return: 0 if there are waiters (wait_for_completion() in progress) * 1 if there are no waiters. * + * Note, this will always return true if complete_all() was called on @X. */ bool completion_done(struct completion *x) { + unsigned long flags; + if (!READ_ONCE(x->done)) return false; @@ -307,14 +328,9 @@ bool completion_done(struct completion *x) * If ->done, we need to wait for complete() to release ->wait.lock * otherwise we can end up freeing the completion before complete() * is done referencing it. - * - * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders - * the loads of ->done and ->wait.lock such that we cannot observe - * the lock before complete() acquires it while observing the ->done - * after it's acquired the lock. */ - smp_rmb(); - spin_unlock_wait(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); + spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20fba81..6d2c7ff9ba98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -951,8 +951,13 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (unlikely(!cpu_active(dest_cpu))) - return rq; + if (p->flags & PF_KTHREAD) { + if (unlikely(!cpu_online(dest_cpu))) + return rq; + } else { + if (unlikely(!cpu_active(dest_cpu))) + return rq; + } /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) @@ -1967,8 +1972,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * reordered with p->state check below. This pairs with mb() in * set_current_state() the waiting thread does. */ - smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); + smp_mb__after_spinlock(); if (!(p->state & state)) goto out; @@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + * + * TODO: This smp_mb__after_unlock_lock can go away if PPC end + * up adding a full barrier to switch_mm(), or we should figure + * out if a smp_mb__after_unlock_lock is really the proper API + * to use. + */ + smp_mb__after_unlock_lock(); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -3281,8 +3296,8 @@ static void __sched notrace __schedule(bool preempt) * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ - smp_mb__before_spinlock(); rq_lock(rq, &rf); + smp_mb__after_spinlock(); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; @@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. For TSO + * (e.g. x86), the architecture must provide its own + * barrier in switch_mm(). For weakly ordered machines + * for which spin_unlock() acts as a full memory + * barrier, finish_lock_switch() in common code takes + * care of this barrier. For weakly ordered machines for + * which spin_unlock() acts as a RELEASE barrier (only + * arm64 and PowerPC), arm64 has a full barrier in + * switch_to(), and PowerPC has + * smp_mb__after_unlock_lock() before + * finish_lock_switch(). + */ ++*switch_count; trace_sched_switch(preempt, prev, next); @@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void) * To avoid it, we have to wait for releasing tsk->pi_lock which * is held by try_to_wake_up() */ - smp_mb(); - raw_spin_unlock_wait(¤t->pi_lock); + raw_spin_lock_irq(¤t->pi_lock); + raw_spin_unlock_irq(¤t->pi_lock); /* Causes final put_task_struct in finish_task_switch(): */ __set_current_state(TASK_DEAD); @@ -5103,24 +5133,17 @@ out_unlock: return retval; } -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned long state = p->state; - - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); if (!try_get_task_stack(p)) return; - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); - if (state == TASK_RUNNING) + + printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + + if (p->state == TASK_RUNNING) printk(KERN_CONT " running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); @@ -5177,11 +5200,6 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - /** * init_idle - set up an idle thread for a given CPU * @idle: task in question @@ -5438,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) */ next = pick_next_task(rq, &fake_task, rf); BUG_ON(!next); - next->sched_class->put_prev_task(rq, next); + put_prev_task(rq, next); /* * Rules for changing task_struct::cpus_allowed are holding diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index fba235c7d026..8d9562d890d3 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp) * @p: the task * @later_mask: a mask to fill in with the selected CPUs (or NULL) * - * Returns: int - best CPU (heap maximum if suitable) + * Returns: int - CPUs were found */ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask) { - int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { - best_cpu = cpumask_any(later_mask); - goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && - dl_time_before(dl_se->deadline, cp->elements[0].dl)) { - best_cpu = cpudl_maximum(cp); - if (later_mask) - cpumask_set_cpu(best_cpu, later_mask); - } + return 1; + } else { + int best_cpu = cpudl_maximum(cp); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); -out: - WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); + if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); - return best_cpu; + return 1; + } + } + return 0; } /* @@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp) { int i; - memset(cp, 0, sizeof(*cp)); raw_spin_lock_init(&cp->lock); cp->size = 0; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 29a397067ffa..9209d83ecdcf 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -52,9 +52,11 @@ struct sugov_policy { struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + unsigned int cpu; - unsigned long iowait_boost; - unsigned long iowait_boost_max; + bool iowait_boost_pending; + unsigned int iowait_boost; + unsigned int iowait_boost_max; u64 last_update; /* The fields below are only needed when sharing a policy. */ @@ -76,6 +78,26 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) { s64 delta_ns; + /* + * Since cpufreq_update_util() is called with rq->lock held for + * the @target_cpu, our per-cpu data is fully serialized. + * + * However, drivers cannot in general deal with cross-cpu + * requests, so while get_next_freq() will work, our + * sugov_update_commit() call may not for the fast switching platforms. + * + * Hence stop here for remote requests if they aren't supported + * by the hardware, as calculating the frequency is pointless if + * we cannot in fact act on it. + * + * For the slow switching platforms, the kthread is always scheduled on + * the right set of CPUs and any CPU can find the next frequency and + * schedule the kthread. + */ + if (sg_policy->policy->fast_switch_enabled && + !cpufreq_can_do_remote_dvfs(sg_policy->policy)) + return false; + if (sg_policy->work_in_progress) return false; @@ -106,7 +128,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (policy->fast_switch_enabled) { next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (next_freq == CPUFREQ_ENTRY_INVALID) + if (!next_freq) return; policy->cur = next_freq; @@ -154,12 +176,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max) +static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) { - struct rq *rq = this_rq(); + struct rq *rq = cpu_rq(cpu); unsigned long cfs_max; - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + cfs_max = arch_scale_cpu_capacity(NULL, cpu); *util = min(rq->cfs.avg.util_avg, cfs_max); *max = cfs_max; @@ -169,30 +191,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { if (flags & SCHED_CPUFREQ_IOWAIT) { - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + if (sg_cpu->iowait_boost_pending) + return; + + sg_cpu->iowait_boost_pending = true; + + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else { + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + } } else if (sg_cpu->iowait_boost) { s64 delta_ns = time - sg_cpu->last_update; /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (delta_ns > TICK_NSEC) + if (delta_ns > TICK_NSEC) { sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_pending = false; + } } } static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util = sg_cpu->iowait_boost; - unsigned long boost_max = sg_cpu->iowait_boost_max; + unsigned int boost_util, boost_max; - if (!boost_util) + if (!sg_cpu->iowait_boost) return; + if (sg_cpu->iowait_boost_pending) { + sg_cpu->iowait_boost_pending = false; + } else { + sg_cpu->iowait_boost >>= 1; + if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + sg_cpu->iowait_boost = 0; + return; + } + } + + boost_util = sg_cpu->iowait_boost; + boost_max = sg_cpu->iowait_boost_max; + if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; } - sg_cpu->iowait_boost >>= 1; } #ifdef CONFIG_NO_HZ_COMMON @@ -229,7 +275,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT_DL) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, sg_cpu->cpu); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -264,6 +310,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; + j_sg_cpu->iowait_boost_pending = false; continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) @@ -290,7 +337,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, sg_cpu->cpu); raw_spin_lock(&sg_policy->update_lock); @@ -445,7 +492,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + + /* Kthread is bound to all CPUs by default */ + if (!policy->dvfs_possible_from_any_cpu) + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -528,16 +579,7 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - if (policy->transition_delay_us) { - tunables->rate_limit_us = policy->transition_delay_us; - } else { - unsigned int lat; - - tunables->rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; - } + tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy); policy->governor_data = sg_policy; sg_policy->tunables = tunables; @@ -655,6 +697,7 @@ static void sugov_limits(struct cpufreq_policy *policy) static struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, + .dynamic_switching = true, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, @@ -671,6 +714,11 @@ struct cpufreq_governor *cpufreq_default_governor(void) static int __init sugov_register(void) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(sugov_cpu, cpu).cpu = cpu; + return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..2511aba36b89 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp) { int i; - memset(cp, 0, sizeof(*cp)); - for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 755bd3f1a1a9..0191ec7667c3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -296,7 +296,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; - return dl_rq->rb_leftmost == &dl_se->rb_node; + return dl_rq->root.rb_leftmost == &dl_se->rb_node; } void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) @@ -320,7 +320,7 @@ void init_dl_bw(struct dl_bw *dl_b) void init_dl_rq(struct dl_rq *dl_rq) { - dl_rq->rb_root = RB_ROOT; + dl_rq->root = RB_ROOT_CACHED; #ifdef CONFIG_SMP /* zero means no -deadline tasks */ @@ -328,7 +328,7 @@ void init_dl_rq(struct dl_rq *dl_rq) dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; - dl_rq->pushable_dl_tasks_root = RB_ROOT; + dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else init_dl_bw(&dl_rq->dl_bw); #endif @@ -410,10 +410,10 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; struct rb_node *parent = NULL; struct task_struct *entry; - int leftmost = 1; + bool leftmost = true; BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); @@ -425,17 +425,16 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) link = &parent->rb_left; else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) { - dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + if (leftmost) dl_rq->earliest_dl.next = p->dl.deadline; - } rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + rb_insert_color_cached(&p->pushable_dl_tasks, + &dl_rq->pushable_dl_tasks_root, leftmost); } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -445,24 +444,23 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { struct rb_node *next_node; next_node = rb_next(&p->pushable_dl_tasks); - dl_rq->pushable_dl_tasks_leftmost = next_node; if (next_node) { dl_rq->earliest_dl.next = rb_entry(next_node, struct task_struct, pushable_dl_tasks)->dl.deadline; } } - rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } static inline int has_pushable_dl_tasks(struct rq *rq) { - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); } static int push_dl_task(struct rq *rq); @@ -1136,7 +1134,7 @@ static void update_curr_dl(struct rq *rq) } /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); + cpufreq_update_util(rq, SCHED_CPUFREQ_DL); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1266,7 +1264,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { - struct rb_node *leftmost = dl_rq->rb_leftmost; + struct rb_node *leftmost = dl_rq->root.rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); @@ -1313,7 +1311,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node **link = &dl_rq->root.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_dl_entity *entry; int leftmost = 1; @@ -1331,11 +1329,8 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) } } - if (leftmost) - dl_rq->rb_leftmost = &dl_se->rb_node; - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); inc_dl_tasks(dl_se, dl_rq); } @@ -1347,14 +1342,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) if (RB_EMPTY_NODE(&dl_se->rb_node)) return; - if (dl_rq->rb_leftmost == &dl_se->rb_node) { - struct rb_node *next_node; - - next_node = rb_next(&dl_se->rb_node); - dl_rq->rb_leftmost = next_node; - } - - rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + rb_erase_cached(&dl_se->rb_node, &dl_rq->root); RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); @@ -1594,7 +1582,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) + !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) return; /* @@ -1602,7 +1590,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * see if it is pushed or pulled somewhere else. */ if (p->nr_cpus_allowed != 1 && - cpudl_find(&rq->rd->cpudl, p, NULL) != -1) + cpudl_find(&rq->rd->cpudl, p, NULL)) return; resched_curr(rq); @@ -1647,7 +1635,7 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, struct dl_rq *dl_rq) { - struct rb_node *left = dl_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; @@ -1655,7 +1643,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct * +static struct task_struct * pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; @@ -1771,7 +1759,7 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost; + struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; if (!has_pushable_dl_tasks(rq)) @@ -1798,7 +1786,7 @@ static int find_later_rq(struct task_struct *task) struct sched_domain *sd; struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); int this_cpu = smp_processor_id(); - int best_cpu, cpu = task_cpu(task); + int cpu = task_cpu(task); /* Make sure the mask is initialized first */ if (unlikely(!later_mask)) @@ -1811,17 +1799,14 @@ static int find_later_rq(struct task_struct *task) * We have to consider system topology and task affinity * first, then we can look for a suitable cpu. */ - best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, - task, later_mask); - if (best_cpu == -1) + if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) return -1; /* - * If we are here, some target has been found, - * the most suitable of which is cached in best_cpu. - * This is, among the runqueues where the current tasks - * have later deadlines than the task's one, the rq - * with the latest possible one. + * If we are here, some targets have been found, including + * the most suitable which is, among the runqueues where the + * current tasks have later deadlines than the task's one, the + * rq with the latest possible one. * * Now we check how well this matches with task's * affinity and system topology. @@ -1841,6 +1826,7 @@ static int find_later_rq(struct task_struct *task) rcu_read_lock(); for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; /* * If possible, preempting this_cpu is @@ -1852,12 +1838,15 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } + best_cpu = cpumask_first_and(later_mask, + sched_domain_span(sd)); /* - * Last chance: if best_cpu is valid and is - * in the mask, that becomes our choice. + * Last chance: if a cpu being in both later_mask + * and current sd span is valid, that becomes our + * choice. Of course, the latest possible cpu is + * already under consideration through later_mask. */ - if (best_cpu < nr_cpu_ids && - cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { + if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; } @@ -1944,7 +1933,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, struct task_struct, pushable_dl_tasks); BUG_ON(rq->cpu != task_cpu(p)); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fa66de52bd6..8e536d963652 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) return table; } +static cpumask_var_t sd_sysctl_cpus; static struct ctl_table_header *sd_sysctl_header; + void register_sched_domain_sysctl(void) { - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + static struct ctl_table *cpu_entries; + static struct ctl_table **cpu_idx; char buf[32]; + int i; - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; + if (!cpu_entries) { + cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); + if (!cpu_entries) + return; - if (entry == NULL) - return; + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = cpu_entries; + } - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; + if (!cpu_idx) { + struct ctl_table *e = cpu_entries; + + cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); + if (!cpu_idx) + return; + + /* deal with sparse possible map */ + for_each_possible_cpu(i) { + cpu_idx[i] = e; + e++; + } + } + + if (!cpumask_available(sd_sysctl_cpus)) { + if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) + return; + + /* init to possible to not have holes in @cpu_entries */ + cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); + } + + for_each_cpu(i, sd_sysctl_cpus) { + struct ctl_table *e = cpu_idx[i]; + + if (e->child) + sd_free_ctl_entry(&e->child); + + if (!e->procname) { + snprintf(buf, 32, "cpu%d", i); + e->procname = kstrdup(buf, GFP_KERNEL); + } + e->mode = 0555; + e->child = sd_alloc_ctl_cpu_table(i); + + __cpumask_clear_cpu(i, sd_sysctl_cpus); } WARN_ON(sd_sysctl_header); sd_sysctl_header = register_sysctl_table(sd_ctl_root); } +void dirty_sched_domain_sysctl(int cpu) +{ + if (cpumask_available(sd_sysctl_cpus)) + __cpumask_set_cpu(cpu, sd_sysctl_cpus); +} + /* may be called multiple times per register */ void unregister_sched_domain_sysctl(void) { unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); } #endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SMP */ @@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg) } #endif +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { if (rq->curr == p) - SEQ_printf(m, "R"); + SEQ_printf(m, ">R"); else - SEQ_printf(m, " "); + SEQ_printf(m, " %c", task_state_to_char(p)); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), @@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key switches prio" + " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n" - "------------------------------------------------------" + "-------------------------------------------------------" "----------------------------------------------------\n"); rcu_read_lock(); @@ -488,7 +530,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) + if (rb_first_cached(&cfs_rq->tasks_timeline)) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) @@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) #endif } -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------" diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c95880e216f6..a5d83ed8dd82 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -513,6 +513,7 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); u64 vruntime = cfs_rq->min_vruntime; @@ -523,10 +524,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); + if (leftmost) { /* non-empty tree */ + struct sched_entity *se; + se = rb_entry(leftmost, struct sched_entity, run_node); if (!curr) vruntime = se->vruntime; @@ -547,10 +547,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - int leftmost = 1; + bool leftmost = true; /* * Find the right place in the rbtree: @@ -566,36 +566,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + rb_insert_color_cached(&se->run_node, + &cfs_rq->tasks_timeline, leftmost); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) { - struct rb_node *next_node; - - next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; - } - - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { - struct rb_node *left = cfs_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); if (!left) return NULL; @@ -616,7 +603,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); if (!last) return NULL; @@ -806,7 +793,7 @@ void post_init_entity_util_avg(struct sched_entity *se) /* * For !fair tasks do: * - update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq); attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * @@ -1071,6 +1058,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + int active_nodes; + + struct rcu_head rcu; + unsigned long total_faults; + unsigned long max_faults_cpu; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long *faults_cpu; + unsigned long faults[0]; +}; + +static inline unsigned long group_faults_priv(struct numa_group *ng); +static inline unsigned long group_faults_shared(struct numa_group *ng); + static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; @@ -1107,13 +1117,47 @@ static unsigned int task_scan_min(struct task_struct *p) return max_t(unsigned int, floor, scan); } +static unsigned int task_scan_start(struct task_struct *p) +{ + unsigned long smin = task_scan_min(p); + unsigned long period = smin; + + /* Scale the maximum scan period with the amount of shared memory. */ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + } + + return max(smin, period); +} + static unsigned int task_scan_max(struct task_struct *p) { - unsigned int smin = task_scan_min(p); - unsigned int smax; + unsigned long smin = task_scan_min(p); + unsigned long smax; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + + /* Scale the maximum scan period with the amount of shared memory. */ + if (p->numa_group) { + struct numa_group *ng = p->numa_group; + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + unsigned long period = smax; + + period *= atomic_read(&ng->refcount); + period *= shared + 1; + period /= private + shared + 1; + + smax = max(smax, period); + } + return max(smin, smax); } @@ -1129,26 +1173,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } -struct numa_group { - atomic_t refcount; - - spinlock_t lock; /* nr_tasks, tasks */ - int nr_tasks; - pid_t gid; - int active_nodes; - - struct rcu_head rcu; - unsigned long total_faults; - unsigned long max_faults_cpu; - /* - * Faults_cpu is used to decide whether memory should move - * towards the CPU. As a consequence, these stats are weighted - * more by CPU use than by memory faults. - */ - unsigned long *faults_cpu; - unsigned long faults[0]; -}; - /* Shared or private faults. */ #define NR_NUMA_HINT_FAULT_TYPES 2 @@ -1198,6 +1222,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +static inline unsigned long group_faults_priv(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + + return faults; +} + +static inline unsigned long group_faults_shared(struct numa_group *ng) +{ + unsigned long faults = 0; + int node; + + for_each_online_node(node) { + faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; + } + + return faults; +} + /* * A node triggering more than 1/3 as many NUMA faults as the maximum is * considered part of a numa group's pseudo-interleaving set. Migrations @@ -1378,7 +1426,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(const int cpu); +static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long capacity_of(int cpu); @@ -1409,7 +1457,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) struct rq *rq = cpu_rq(cpu); ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); + ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); cpus++; @@ -1808,7 +1856,7 @@ static int task_numa_migrate(struct task_struct *p) * Reset the scan period if the task is being rescheduled on an * alternative node to recheck if the tasks is now properly placed. */ - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); @@ -1892,7 +1940,7 @@ static void update_task_scan_period(struct task_struct *p, unsigned long shared, unsigned long private) { unsigned int period_slot; - int ratio; + int lr_ratio, ps_ratio; int diff; unsigned long remote = p->numa_faults_locality[0]; @@ -1922,25 +1970,36 @@ static void update_task_scan_period(struct task_struct *p, * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) */ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); - ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); - if (ratio >= NUMA_PERIOD_THRESHOLD) { - int slot = ratio - NUMA_PERIOD_THRESHOLD; + lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared); + + if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { + /* + * Most memory accesses are local. There is no need to + * do fast NUMA scanning, since memory is already local. + */ + int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) { + /* + * Most memory accesses are shared with other tasks. + * There is no point in continuing fast NUMA scanning, + * since other tasks may just move the memory elsewhere. + */ + int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; if (!slot) slot = 1; diff = slot * period_slot; } else { - diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; - /* - * Scale scan rate increases based on sharing. There is an - * inverse relationship between the degree of sharing and - * the adjustment made to the scanning period. Broadly - * speaking the intent is that there is little point - * scanning faster if shared accesses dominate as it may - * simply bounce migrations uselessly + * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, + * yet they are not on the local NUMA node. Speed up + * NUMA scanning to get the memory moved over. */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); - diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + int ratio = max(lr_ratio, ps_ratio); + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; } p->numa_scan_period = clamp(p->numa_scan_period + diff, @@ -2448,7 +2507,7 @@ void task_numa_work(struct callback_head *work) if (p->numa_scan_period == 0) { p->numa_scan_period_max = task_scan_max(p); - p->numa_scan_period = task_scan_min(p); + p->numa_scan_period = task_scan_start(p); } next_scan = now + msecs_to_jiffies(p->numa_scan_period); @@ -2576,7 +2635,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now > curr->node_stamp + period) { if (!curr->node_stamp) - curr->numa_scan_period = task_scan_min(curr); + curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) { @@ -2586,59 +2645,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } -/* - * Can a task be moved from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? - */ -static inline bool numa_wake_affine(struct sched_domain *sd, - struct task_struct *p, int this_cpu, - int prev_cpu, int sync) -{ - struct numa_stats prev_load, this_load; - s64 this_eff_load, prev_eff_load; - - update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); - update_numa_stats(&this_load, cpu_to_node(this_cpu)); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - unsigned long current_load = task_h_load(current); - - if (this_load.load > current_load) - this_load.load -= current_load; - else - this_load.load = 0; - } - - /* - * In low-load situations, where this_cpu's node is idle due to the - * sync cause above having dropped this_load.load to 0, move the task. - * Moving to an idle socket will not create a bad imbalance. - * - * Otherwise check if the nodes are near enough in load to allow this - * task to be woken on this_cpu's node. - */ - if (this_load.load > 0) { - unsigned long task_load = task_h_load(p); - - this_eff_load = 100; - this_eff_load *= prev_load.compute_capacity; - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= this_load.compute_capacity; - - this_eff_load *= this_load.load + task_load; - prev_eff_load *= prev_load.load - task_load; - - return this_eff_load <= prev_eff_load; - } - - return true; -} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2652,14 +2658,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } -#ifdef CONFIG_SMP -static inline bool numa_wake_affine(struct sched_domain *sd, - struct task_struct *p, int this_cpu, - int prev_cpu, int sync) -{ - return true; -} -#endif /* !SMP */ #endif /* CONFIG_NUMA_BALANCING */ static void @@ -2790,6 +2788,31 @@ static inline void update_cfs_shares(struct sched_entity *se) } #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (&rq->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq, 0); + } +} + #ifdef CONFIG_SMP /* * Approximate: @@ -2968,6 +2991,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, sa->last_update_time += delta << 10; /* + * running is a subset of runnable (weight) so running can't be set if + * runnable is clear. But there are some corner cases where the current + * se has been already dequeued but cfs_rq->curr still points to it. + * This means that weight will be 0 but not running for a sched_entity + * but also for a cfs_rq if the latter becomes idle. As an example, + * this happens during idle_balance() which calls + * update_blocked_averages() + */ + if (!weight) + running = 0; + + /* * Now we know we crossed measurement unit boundaries. The *_avg * accrues by two steps: * @@ -3276,29 +3311,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) -{ - if (&this_rq()->cfs == cfs_rq) { - /* - * There are a few boundary cases this might miss but it should - * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. - * - * It will not get called when we go idle, because the idle - * thread is a different class (!fair), nor will the utilization - * number include things like RT tasks. - * - * As is, the util number is not freq-invariant (we'd have to - * implement arch_scale_freq_capacity() for that). - * - * See cpu_util(). - */ - cpufreq_update_util(rq_of(cfs_rq), 0); - } -} - /* * Unsigned subtract and clamp on underflow. * @@ -3320,7 +3332,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_task() * @cfs_rq: cfs_rq to update - * @update_freq: should we call cfs_rq_util_change() or will the call do so * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) * avg. The immediate corollary is that all (fair) tasks must be attached, see @@ -3334,7 +3345,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * call update_tg_load_avg() when this function returns true. */ static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { struct sched_avg *sa = &cfs_rq->avg; int decayed, removed_load = 0, removed_util = 0; @@ -3362,7 +3373,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (update_freq && (decayed || removed_util)) + if (decayed || removed_util) cfs_rq_util_change(cfs_rq); return decayed || removed_load; @@ -3390,7 +3401,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cpu, cfs_rq, se); - decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed = update_cfs_rq_load_avg(now, cfs_rq); decayed |= propagate_entity_load_avg(se); if (decayed && (flags & UPDATE_TG)) @@ -3534,7 +3545,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf); #else /* CONFIG_SMP */ static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { return 0; } @@ -3544,7 +3555,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) static inline void update_load_avg(struct sched_entity *se, int not_used1) { - cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); + cfs_rq_util_change(cfs_rq_of(se)); } static inline void @@ -4875,7 +4886,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * passed. */ if (p->in_iowait) - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); for_each_sched_entity(se) { if (se->on_rq) @@ -5125,9 +5136,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, } /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static unsigned long weighted_cpuload(struct rq *rq) { - return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); + return cfs_rq_runnable_load_avg(&rq->cfs); } #ifdef CONFIG_NO_HZ_COMMON @@ -5172,7 +5183,7 @@ static void cpu_load_update_idle(struct rq *this_rq) /* * bail if there's load or we're actually up-to-date. */ - if (weighted_cpuload(cpu_of(this_rq))) + if (weighted_cpuload(this_rq)) return; cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); @@ -5193,7 +5204,7 @@ void cpu_load_update_nohz_start(void) * concurrently we'll exit nohz. And cpu_load write can race with * cpu_load_update_idle() but both updater would be writing the same. */ - this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); + this_rq->cpu_load[0] = weighted_cpuload(this_rq); } /* @@ -5209,7 +5220,7 @@ void cpu_load_update_nohz_stop(void) if (curr_jiffies == this_rq->last_load_update_tick) return; - load = weighted_cpuload(cpu_of(this_rq)); + load = weighted_cpuload(this_rq); rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); @@ -5235,7 +5246,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) */ void cpu_load_update_active(struct rq *this_rq) { - unsigned long load = weighted_cpuload(cpu_of(this_rq)); + unsigned long load = weighted_cpuload(this_rq); if (tick_nohz_tick_stopped()) cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); @@ -5253,7 +5264,7 @@ void cpu_load_update_active(struct rq *this_rq) static unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = weighted_cpuload(rq); if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -5268,7 +5279,7 @@ static unsigned long source_load(int cpu, int type) static unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + unsigned long total = weighted_cpuload(rq); if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -5290,7 +5301,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(cpu); + unsigned long load_avg = weighted_cpuload(rq); if (nr_running) return load_avg / nr_running; @@ -5345,20 +5356,115 @@ static int wake_wide(struct task_struct *p) return 1; } +struct llc_stats { + unsigned long nr_running; + unsigned long load; + unsigned long capacity; + int has_capacity; +}; + +static bool get_llc_stats(struct llc_stats *stats, int cpu) +{ + struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + + if (!sds) + return false; + + stats->nr_running = READ_ONCE(sds->nr_running); + stats->load = READ_ONCE(sds->load); + stats->capacity = READ_ONCE(sds->capacity); + stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); + + return true; +} + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? + * + * Since we're running on 'stale' values, we might in fact create an imbalance + * but recomputing these values is expensive, as that'd mean iteration 2 cache + * domains worth of CPUs. + */ +static bool +wake_affine_llc(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) +{ + struct llc_stats prev_stats, this_stats; + s64 this_eff_load, prev_eff_load; + unsigned long task_load; + + if (!get_llc_stats(&prev_stats, prev_cpu) || + !get_llc_stats(&this_stats, this_cpu)) + return false; + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current LLC. + */ + if (sync) { + unsigned long current_load = task_h_load(current); + + /* in this case load hits 0 and this LLC is considered 'idle' */ + if (current_load > this_stats.load) + return true; + + this_stats.load -= current_load; + } + + /* + * The has_capacity stuff is not SMT aware, but by trying to balance + * the nr_running on both ends we try and fill the domain at equal + * rates, thereby first consuming cores before siblings. + */ + + /* if the old cache has capacity, stay there */ + if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) + return false; + + /* if this cache has capacity, come here */ + if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1) + return true; + + /* + * Check to see if we can move the load without causing too much + * imbalance. + */ + task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_stats.capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_stats.capacity; + + this_eff_load *= this_stats.load + task_load; + prev_eff_load *= prev_stats.load - task_load; + + return this_eff_load <= prev_eff_load; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine = false; + bool affine; /* - * Common case: CPUs are in the same socket, and select_idle_sibling() - * will do its thing regardless of what we return: + * Default to no affine wakeups; wake_affine() should not effect a task + * placement the load-balancer feels inclined to undo. The conservative + * option is therefore to not move tasks when they wake up. */ - if (cpus_share_cache(prev_cpu, this_cpu)) - affine = true; - else - affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); + affine = false; + + /* + * If the wakeup is across cache domains, try to evaluate if movement + * makes sense, otherwise rely on select_idle_siblings() to do + * placement inside the cache domain. + */ + if (!cpus_share_cache(prev_cpu, this_cpu)) + affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { @@ -5550,7 +5656,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(i); + load = weighted_cpuload(cpu_rq(i)); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; least_loaded_cpu = i; @@ -6187,10 +6293,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: -#ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) goto idle; +#ifdef CONFIG_FAIR_GROUP_SCHED if (prev->sched_class != &fair_sched_class) goto simple; @@ -6220,11 +6326,17 @@ again: /* * This call to check_cfs_rq_runtime() will do the * throttle and dequeue its entity in the parent(s). - * Therefore the 'simple' nr_running test will indeed + * Therefore the nr_running test will indeed * be correct. */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) + if (unlikely(check_cfs_rq_runtime(cfs_rq))) { + cfs_rq = &rq->cfs; + + if (!cfs_rq->nr_running) + goto idle; + goto simple; + } } se = pick_next_entity(cfs_rq, curr); @@ -6264,12 +6376,8 @@ again: return p; simple: - cfs_rq = &rq->cfs; #endif - if (!cfs_rq->nr_running) - goto idle; - put_prev_task(rq, prev); do { @@ -6917,7 +7025,7 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) update_tg_load_avg(cfs_rq, 0); /* Propagate pending load changes to the parent, if any: */ @@ -6990,7 +7098,7 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); rq_unlock_irqrestore(rq, &rf); } @@ -7036,6 +7144,7 @@ struct sg_lb_stats { struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ + unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ @@ -7055,6 +7164,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, + .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { @@ -7363,7 +7473,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(i); + sgs->sum_weighted_load += weighted_cpuload(rq); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -7490,6 +7600,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { + struct sched_domain_shared *shared = env->sd->shared; struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; @@ -7546,6 +7657,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ + sds->total_running += sgs->sum_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; @@ -7561,6 +7673,21 @@ next_group: env->dst_rq->rd->overload = overload; } + if (!shared) + return; + + /* + * Since these are sums over groups they can contain some CPUs + * multiple times for the NUMA domains. + * + * Currently only wake_affine_llc() and find_busiest_group() + * uses these numbers, only the last is affected by this problem. + * + * XXX fix that. + */ + WRITE_ONCE(shared->nr_running, sds->total_running); + WRITE_ONCE(shared->load, sds->total_load); + WRITE_ONCE(shared->capacity, sds->total_capacity); } /** @@ -7790,6 +7917,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; + /* XXX broken for overlapping NUMA groups */ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; @@ -7892,7 +8020,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, capacity = capacity_of(i); - wl = weighted_cpuload(i); + wl = weighted_cpuload(rq); /* * When comparing with imbalance, use weighted_cpuload() @@ -9171,7 +9299,7 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { - cfs_rq->tasks_timeline = RB_ROOT; + cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6c23e30c0e5c..257f4f0b4532 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -158,7 +158,7 @@ static void cpuidle_idle_call(void) } /* - * Suspend-to-idle ("freeze") is a system state in which all user space + * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only * activity happens here and in iterrupts (if any). In that case bypass * the cpuidle governor and go stratight for the deepest idle state @@ -167,9 +167,9 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. */ - if (idle_should_freeze() || dev->use_deepest_state) { - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); + if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle()) { + entered_state = cpuidle_enter_s2idle(drv, dev); if (entered_state > 0) { local_irq_enable(); goto exit_idle; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c new file mode 100644 index 000000000000..a92fddc22747 --- /dev/null +++ b/kernel/sched/membarrier.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/syscalls.h> +#include <linux/membarrier.h> +#include <linux/tick.h> +#include <linux/cpumask.h> + +#include "sched.h" /* for cpu_rq(). */ + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + +static void ipi_mb(void *info) +{ + smp_mb(); /* IPIs should be serializing but paranoid. */ +} + +static void membarrier_private_expedited(void) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (num_online_cpus() == 1) + return; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm == current->mm) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ +} + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, not available on the running + * kernel, or if the command argument is invalid, this system call + * returns -EINVAL. For a given command, with flags argument set to 0, + * this system call is guaranteed to always return the same value until + * reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + { + int cmd_mask = MEMBARRIER_CMD_BITMASK; + + if (tick_nohz_full_enabled()) + cmd_mask &= ~MEMBARRIER_CMD_SHARED; + return cmd_mask; + } + case MEMBARRIER_CMD_SHARED: + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -EINVAL; + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + case MEMBARRIER_CMD_PRIVATE_EXPEDITED: + membarrier_private_expedited(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 45caf937ef90..0af5ca9e3e3f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -970,7 +970,7 @@ static void update_curr_rt(struct rq *rq) return; /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); + cpufreq_update_util(rq, SCHED_CPUFREQ_RT); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..746ac78ff492 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -426,8 +426,7 @@ struct cfs_rq { u64 min_vruntime_copy; #endif - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; + struct rb_root_cached tasks_timeline; /* * 'curr' points to currently running entity on this cfs_rq. @@ -550,8 +549,7 @@ struct rt_rq { /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ - struct rb_root rb_root; - struct rb_node *rb_leftmost; + struct rb_root_cached root; unsigned long dl_nr_running; @@ -575,8 +573,7 @@ struct dl_rq { * an rb-tree, ordered by tasks' deadlines, with caching * of the leftmost (earliest deadline) element. */ - struct rb_root pushable_dl_tasks_root; - struct rb_node *pushable_dl_tasks_leftmost; + struct rb_root_cached pushable_dl_tasks_root; #else struct dl_bw dl_bw; #endif @@ -769,7 +766,7 @@ struct rq { #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; - struct call_single_data hrtick_csd; + call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; #endif @@ -1120,11 +1117,15 @@ extern int group_balance_cpu(struct sched_group *sg); #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) void register_sched_domain_sysctl(void); +void dirty_sched_domain_sysctl(int cpu); void unregister_sched_domain_sysctl(void); #else static inline void register_sched_domain_sysctl(void) { } +static inline void dirty_sched_domain_sysctl(int cpu) +{ +} static inline void unregister_sched_domain_sysctl(void) { } @@ -2070,19 +2071,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); if (data) data->func(data, rq_clock(rq), flags); } - -static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -{ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_update_util(rq, flags); -} #else static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 3d5610dcce11..2227e183e202 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q) { unsigned long flags; - if (!swait_active(q)) - return; - raw_spin_lock_irqsave(&q->lock, flags); swake_up_locked(q); raw_spin_unlock_irqrestore(&q->lock, flags); @@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q) struct swait_queue *curr; LIST_HEAD(tmp); - if (!swait_active(q)) - return; - raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 79895aec281e..5d0062cc10cb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) static int init_rootdomain(struct root_domain *rd) { - memset(rd, 0, sizeof(*rd)); - if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) @@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void) { struct root_domain *rd; - rd = kmalloc(sizeof(*rd), GFP_KERNEL); + rd = kzalloc(sizeof(*rd), GFP_KERNEL); if (!rd) return NULL; @@ -337,7 +335,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) kfree(sg->sgc); - kfree(sg); + if (atomic_dec_and_test(&sg->ref)) + kfree(sg); sg = tmp; } while (sg != first); } @@ -345,15 +344,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) static void destroy_sched_domain(struct sched_domain *sd) { /* - * If its an overlapping domain it has private groups, iterate and - * nuke them all. + * A normal sched domain may have multiple group references, an + * overlapping domain, having private groups, only one. Iterate, + * dropping group/capacity references, freeing where none remain. */ - if (sd->flags & SD_OVERLAP) { - free_sched_groups(sd->groups, 1); - } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgc); - kfree(sd->groups); - } + free_sched_groups(sd->groups, 1); + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) kfree(sd->shared); kfree(sd); @@ -463,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) rq_attach_root(rq, rd); tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); + dirty_sched_domain_sysctl(cpu); destroy_sched_domains(tmp); update_top_cache_domain(cpu); @@ -476,7 +473,7 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); return 0; } return 1; @@ -670,6 +667,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) else cpumask_copy(sg_span, sched_domain_span(sd)); + atomic_inc(&sg->ref); return sg; } @@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map) } } -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, +static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { @@ -1854,7 +1852,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], /* Let the architecture update CPU core mappings: */ new_topology = arch_update_cpu_topology(); - n = doms_new ? ndoms_new : 0; + if (!doms_new) { + WARN_ON_ONCE(dattr_new); + n = 0; + doms_new = alloc_sched_domains(1); + if (doms_new) { + n = 1; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + } + } else { + n = ndoms_new; + } /* Destroy deleted domains: */ for (i = 0; i < ndoms_cur; i++) { @@ -1870,11 +1878,10 @@ match1: } n = ndoms_cur; - if (doms_new == NULL) { + if (!doms_new) { n = 0; doms_new = &fallback_doms; cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); } /* Build new domains: */ |