summaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c738
1 files changed, 580 insertions, 158 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..aa14a56f9d03 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
#include <asm/irq_regs.h>
#include "sched_cpupri.h"
+#include "workqueue_sched.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -425,9 +426,7 @@ struct root_domain {
*/
cpumask_var_t rto_mask;
atomic_t rto_count;
-#ifdef CONFIG_SMP
struct cpupri cpupri;
-#endif
};
/*
@@ -436,7 +435,7 @@ struct root_domain {
*/
static struct root_domain def_root_domain;
-#endif
+#endif /* CONFIG_SMP */
/*
* This is the main, per-CPU runqueue data structure.
@@ -456,9 +455,10 @@ struct rq {
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
- unsigned char in_nohz_recently;
+ unsigned char nohz_balance_kick;
#endif
unsigned int skip_clock_update;
@@ -486,11 +486,12 @@ struct rq {
*/
unsigned long nr_uninterruptible;
- struct task_struct *curr, *idle;
+ struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
u64 clock;
+ u64 clock_task;
atomic_t nr_iowait;
@@ -518,6 +519,10 @@ struct rq {
u64 avg_idle;
#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
+
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
@@ -641,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
#endif /* CONFIG_CGROUP_SCHED */
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
inline void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update)
- rq->clock = sched_clock_cpu(cpu_of(rq));
+ if (!rq->skip_clock_update) {
+ int cpu = cpu_of(rq);
+ u64 irq_time;
+
+ rq->clock = sched_clock_cpu(cpu);
+ irq_time = irq_time_cpu(cpu);
+ if (rq->clock - irq_time > rq->clock_task)
+ rq->clock_task = rq->clock - irq_time;
+
+ sched_irq_time_avg_update(rq, irq_time);
+ }
}
/*
@@ -721,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[64];
- char *cmp = buf;
+ char *cmp;
int neg = 0;
int i;
@@ -732,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
return -EFAULT;
buf[cnt] = 0;
+ cmp = strstrip(buf);
if (strncmp(buf, "NO_", 3) == 0) {
neg = 1;
@@ -739,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
}
for (i = 0; sched_feat_names[i]; i++) {
- int len = strlen(sched_feat_names[i]);
-
- if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+ if (strcmp(cmp, sched_feat_names[i]) == 0) {
if (neg)
sysctl_sched_features &= ~(1UL << i);
else
@@ -1193,6 +1209,27 @@ static void resched_cpu(int cpu)
#ifdef CONFIG_NO_HZ
/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+ int cpu = smp_processor_id();
+ int i;
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd))
+ if (!idle_cpu(i))
+ return i;
+ }
+ return cpu;
+}
+/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
* which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1269,6 @@ void wake_up_idle_cpu(int cpu)
smp_send_reschedule(cpu);
}
-int nohz_ratelimit(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- u64 diff = rq->clock - rq->nohz_stamp;
-
- rq->nohz_stamp = rq->clock;
-
- return diff < (NSEC_PER_SEC / HZ) >> 1;
-}
-
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
@@ -1281,6 +1308,10 @@ static void resched_task(struct task_struct *p)
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}
+
+static void sched_avg_update(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
@@ -1652,7 +1683,7 @@ static void update_shares(struct sched_domain *sd)
if (root_task_group_empty())
return;
- now = cpu_clock(raw_smp_processor_id());
+ now = local_clock();
elapsed = now - sd->last_update;
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1836,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
+static void update_cpu_load(struct rq *this_rq);
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -1822,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
static const struct sched_class rt_sched_class;
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
@@ -1840,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
- if (task_has_rt_policy(p)) {
- p->se.load.weight = 0;
- p->se.load.inv_weight = WMULT_CONST;
- return;
- }
-
/*
* SCHED_IDLE tasks get minimal weight:
*/
@@ -1899,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dec_nr_running(rq);
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+ if (!sched_clock_irqtime)
+ return 0;
+
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+ unsigned long flags;
+ int cpu;
+ u64 now, delta;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ now = sched_clock_cpu(cpu);
+ delta = now - per_cpu(irq_start_time, cpu);
+ per_cpu(irq_start_time, cpu) = now;
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ per_cpu(cpu_hardirq_time, cpu) += delta;
+ else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ per_cpu(cpu_softirq_time, cpu) += delta;
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+ if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+ u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+ rq->prev_irq_time = curr_irq_time;
+ sched_rt_avg_update(rq, delta_irq);
+ }
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+ return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+ stop->sched_class = &stop_sched_class;
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
+}
+
/*
* __normal_prio - return the priority that is based on the static prio
*/
@@ -1985,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
if (p->sched_class != &fair_sched_class)
return 0;
+ if (unlikely(p->policy == SCHED_IDLE))
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
@@ -2267,11 +2415,55 @@ static void update_avg(u64 *avg, u64 sample)
}
#endif
-/***
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+ bool is_sync, bool is_migrate, bool is_local,
+ unsigned long en_flags)
+{
+ schedstat_inc(p, se.statistics.nr_wakeups);
+ if (is_sync)
+ schedstat_inc(p, se.statistics.nr_wakeups_sync);
+ if (is_migrate)
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+ if (is_local)
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
+ else
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+
+ activate_task(rq, p, en_flags);
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+ int wake_flags, bool success)
+{
+ trace_sched_wakeup(p, success);
+ check_preempt_curr(rq, p, wake_flags);
+
+ p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_woken)
+ p->sched_class->task_woken(rq, p);
+
+ if (unlikely(rq->idle_stamp)) {
+ u64 delta = rq->clock - rq->idle_stamp;
+ u64 max = 2*sysctl_sched_migration_cost;
+
+ if (delta > max)
+ rq->avg_idle = max;
+ else
+ update_avg(&rq->avg_idle, delta);
+ rq->idle_stamp = 0;
+ }
+#endif
+ /* if a worker is waking up, notify workqueue */
+ if ((p->flags & PF_WQ_WORKER) && success)
+ wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/**
* try_to_wake_up - wake up a thread
- * @p: the to-be-woken-up thread
+ * @p: the thread to be awakened
* @state: the mask of task states that can be woken
- * @sync: do a synchronous wakeup?
+ * @wake_flags: wake modifier flags (WF_*)
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
@@ -2279,7 +2471,8 @@ static void update_avg(u64 *avg, u64 sample)
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
- * returns failure only if the task is already active.
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state,
int wake_flags)
@@ -2359,38 +2552,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
out_activate:
#endif /* CONFIG_SMP */
- schedstat_inc(p, se.statistics.nr_wakeups);
- if (wake_flags & WF_SYNC)
- schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (orig_cpu != cpu)
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- if (cpu == this_cpu)
- schedstat_inc(p, se.statistics.nr_wakeups_local);
- else
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
- activate_task(rq, p, en_flags);
+ ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
+ cpu == this_cpu, en_flags);
success = 1;
-
out_running:
- trace_sched_wakeup(p, success);
- check_preempt_curr(rq, p, wake_flags);
-
- p->state = TASK_RUNNING;
-#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
- p->sched_class->task_woken(rq, p);
-
- if (unlikely(rq->idle_stamp)) {
- u64 delta = rq->clock - rq->idle_stamp;
- u64 max = 2*sysctl_sched_migration_cost;
-
- if (delta > max)
- rq->avg_idle = max;
- else
- update_avg(&rq->avg_idle, delta);
- rq->idle_stamp = 0;
- }
-#endif
+ ttwu_post_activation(p, rq, wake_flags, success);
out:
task_rq_unlock(rq, &flags);
put_cpu();
@@ -2399,6 +2565,37 @@ out:
}
/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not alredy there. The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task. this_rq() stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+ bool success = false;
+
+ BUG_ON(rq != this_rq());
+ BUG_ON(p == current);
+ lockdep_assert_held(&rq->lock);
+
+ if (!(p->state & TASK_NORMAL))
+ return;
+
+ if (!p->se.on_rq) {
+ if (likely(!task_running(rq, p))) {
+ schedstat_inc(rq, ttwu_count);
+ schedstat_inc(rq, ttwu_local);
+ }
+ ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+ success = true;
+ }
+ ttwu_post_activation(p, rq, 0, success);
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -2785,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
arch_start_context_switch(prev);
- if (likely(!mm)) {
+ if (!mm) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next);
- if (likely(!prev->mm)) {
+ if (!prev->mm) {
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
@@ -3012,23 +3209,102 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
* Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
*/
static void update_cpu_load(struct rq *this_rq)
{
unsigned long this_load = this_rq->load.weight;
+ unsigned long curr_jiffies = jiffies;
+ unsigned long pending_updates;
int i, scale;
this_rq->nr_load_updates++;
+ /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
/* Update our load: */
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
unsigned long old_load, new_load;
/* scale is effectively 1 << i now, and >> i divides by scale */
old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -3036,10 +3312,18 @@ static void update_cpu_load(struct rq *this_rq)
* example.
*/
if (new_load > old_load)
- new_load += scale-1;
- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
+ sched_avg_update(this_rq);
+}
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+ update_cpu_load(this_rq);
+
calc_load_account_active(this_rq);
}
@@ -3094,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
if (task_current(rq, p)) {
update_rq_clock(rq);
- ns = rq->clock - p->se.exec_start;
+ ns = rq->clock_task - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}
@@ -3243,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
cpustat->irq = cputime64_add(cpustat->irq, tmp);
- else if (softirq_count())
+ else if (in_serving_softirq())
cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
else
cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3359,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
if (total) {
- u64 temp;
+ u64 temp = rtime;
- temp = (u64)(rtime * utime);
+ temp *= utime;
do_div(temp, total);
utime = (cputime_t)temp;
} else
@@ -3392,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
if (total) {
- u64 temp;
+ u64 temp = rtime;
- temp = (u64)(rtime * cputime.utime);
+ temp *= cputime.utime;
do_div(temp, total);
utime = (cputime_t)temp;
} else
@@ -3426,11 +3710,11 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
- update_cpu_load(rq);
+ update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
raw_spin_unlock(&rq->lock);
- perf_event_task_tick(curr);
+ perf_event_task_tick();
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
@@ -3569,17 +3853,13 @@ pick_next_task(struct rq *rq)
return p;
}
- class = sched_class_highest;
- for ( ; ; ) {
+ for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
return p;
- /*
- * Will never be NULL as the idle class always
- * returns a non-NULL p:
- */
- class = class->next;
}
+
+ BUG(); /* the idle class will always have a runnable task */
}
/*
@@ -3598,7 +3878,6 @@ need_resched:
rq = cpu_rq(cpu);
rcu_note_context_switch(cpu);
prev = rq->curr;
- switch_count = &prev->nivcsw;
release_kernel_lock(prev);
need_resched_nonpreemptible:
@@ -3611,11 +3890,26 @@ need_resched_nonpreemptible:
raw_spin_lock_irq(&rq->lock);
clear_tsk_need_resched(prev);
+ switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- if (unlikely(signal_pending_state(prev->state, prev)))
+ if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
- else
+ } else {
+ /*
+ * If a worker is going to sleep, notify and
+ * ask workqueue whether it wants to wake up a
+ * task to maintain concurrency. If so, wake
+ * up the task.
+ */
+ if (prev->flags & PF_WQ_WORKER) {
+ struct task_struct *to_wakeup;
+
+ to_wakeup = wq_worker_sleeping(prev, cpu);
+ if (to_wakeup)
+ try_to_wake_up_local(to_wakeup);
+ }
deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ }
switch_count = &prev->nvcsw;
}
@@ -3637,8 +3931,10 @@ need_resched_nonpreemptible:
context_switch(rq, prev, next); /* unlocks the rq */
/*
- * the context switch might have flipped the stack from under
- * us, hence refresh the local variables.
+ * The context switch have flipped the stack from under us
+ * and restored the local variables which were saved when
+ * this task called schedule() in the past. prev == current
+ * is still correct, but it can be moved to another cpu/rq.
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -3647,11 +3943,8 @@ need_resched_nonpreemptible:
post_schedule(rq);
- if (unlikely(reacquire_kernel_lock(current) < 0)) {
- prev = rq->curr;
- switch_count = &prev->nivcsw;
+ if (unlikely(reacquire_kernel_lock(prev)))
goto need_resched_nonpreemptible;
- }
preempt_enable_no_resched();
if (need_resched())
@@ -3704,8 +3997,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
/*
* Owner changed, break to re-assess state.
*/
- if (lock->owner != owner)
+ if (lock->owner != owner) {
+ /*
+ * If the lock has switched to a different owner,
+ * we likely have heavy contention. Return 0 to quit
+ * optimistic spinning and not contend further:
+ */
+ if (lock->owner)
+ return 0;
break;
+ }
/*
* Is that owner really running on that cpu?
@@ -3726,7 +4027,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
-asmlinkage void __sched preempt_schedule(void)
+asmlinkage void __sched notrace preempt_schedule(void)
{
struct thread_info *ti = current_thread_info();
@@ -3738,9 +4039,9 @@ asmlinkage void __sched preempt_schedule(void)
return;
do {
- add_preempt_count(PREEMPT_ACTIVE);
+ add_preempt_count_notrace(PREEMPT_ACTIVE);
schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
+ sub_preempt_count_notrace(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -4183,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
rq = task_rq_lock(p, &flags);
+ trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->se.on_rq;
@@ -4441,12 +4743,8 @@ recheck:
*/
if (user && !capable(CAP_SYS_NICE)) {
if (rt_policy(policy)) {
- unsigned long rlim_rtprio;
-
- if (!lock_task_sighand(p, &flags))
- return -ESRCH;
- rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
- unlock_task_sighand(p, &flags);
+ unsigned long rlim_rtprio =
+ task_rlimit(p, RLIMIT_RTPRIO);
/* can't set/change the rt policy */
if (policy != p->policy && !rlim_rtprio)
@@ -4474,7 +4772,7 @@ recheck:
}
if (user) {
- retval = security_task_setscheduler(p, policy, param);
+ retval = security_task_setscheduler(p);
if (retval)
return retval;
}
@@ -4490,6 +4788,15 @@ recheck:
*/
rq = __task_rq_lock(p);
+ /*
+ * Changing the policy of the stop threads its a very bad idea
+ */
+ if (p == rq->stop) {
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return -EINVAL;
+ }
+
#ifdef CONFIG_RT_GROUP_SCHED
if (user) {
/*
@@ -4716,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
- retval = security_task_setscheduler(p, 0, NULL);
+ retval = security_task_setscheduler(p);
if (retval)
goto out_unlock;
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
retval = set_cpus_allowed_ptr(p, new_mask);
if (!retval) {
@@ -5166,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->se.exec_start = sched_clock();
cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ /*
+ * We're having a chicken and egg problem, even though we are
+ * holding rq->lock, the cpu isn't yet set to this cpu so the
+ * lockdep check in task_group() will fail.
+ *
+ * Similar case to sched_fork(). / Alternatively we could
+ * use task_rq_lock() here and obtain the other rq->lock.
+ *
+ * Silence PROVE_RCU
+ */
+ rcu_read_lock();
__set_task_cpu(idle, cpu);
+ rcu_read_unlock();
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -5816,20 +6135,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
*/
static struct notifier_block __cpuinitdata migration_notifier = {
.notifier_call = migration_call,
- .priority = 10
+ .priority = CPU_PRI_MIGRATION,
};
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ set_cpu_active((long)hcpu, true);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ set_cpu_active((long)hcpu, false);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
static int __init migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err;
- /* Start one for the boot CPU: */
+ /* Initialize migration for the boot CPU */
err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
BUG_ON(err == NOTIFY_BAD);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
+ /* Register cpu active notifiers */
+ cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+ cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
return 0;
}
early_initcall(migration_init);
@@ -6064,23 +6412,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
free_rootdomain(old_rd);
}
-static int init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd)
{
- gfp_t gfp = GFP_KERNEL;
-
memset(rd, 0, sizeof(*rd));
- if (bootmem)
- gfp = GFP_NOWAIT;
-
- if (!alloc_cpumask_var(&rd->span, gfp))
+ if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
- if (!alloc_cpumask_var(&rd->online, gfp))
+ if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, gfp))
+ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_online;
- if (cpupri_init(&rd->cpupri, bootmem) != 0)
+ if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
return 0;
@@ -6096,7 +6439,7 @@ out:
static void init_defrootdomain(void)
{
- init_rootdomain(&def_root_domain, true);
+ init_rootdomain(&def_root_domain);
atomic_set(&def_root_domain.refcount, 1);
}
@@ -6109,7 +6452,7 @@ static struct root_domain *alloc_rootdomain(void)
if (!rd)
return NULL;
- if (init_rootdomain(rd, false) != 0) {
+ if (init_rootdomain(rd) != 0) {
kfree(rd);
return NULL;
}
@@ -6319,6 +6662,7 @@ struct s_data {
cpumask_var_t nodemask;
cpumask_var_t this_sibling_map;
cpumask_var_t this_core_map;
+ cpumask_var_t this_book_map;
cpumask_var_t send_covered;
cpumask_var_t tmpmask;
struct sched_group **sched_group_nodes;
@@ -6330,6 +6674,7 @@ enum s_alloc {
sa_rootdomain,
sa_tmpmask,
sa_send_covered,
+ sa_this_book_map,
sa_this_core_map,
sa_this_sibling_map,
sa_nodemask,
@@ -6365,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
static int
cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg, struct cpumask *mask)
{
int group;
-
+#ifdef CONFIG_SCHED_SMT
cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
+#else
+ group = cpu;
+#endif
if (sg)
*sg = &per_cpu(sched_group_core, group).sg;
return group;
}
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg, struct cpumask *mask)
{
+ int group = cpu;
+#ifdef CONFIG_SCHED_MC
+ cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+ group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+ group = cpumask_first(mask);
+#endif
if (sg)
- *sg = &per_cpu(sched_group_core, cpu).sg;
- return cpu;
+ *sg = &per_cpu(sched_group_book, group).sg;
+ return group;
}
-#endif
+#endif /* CONFIG_SCHED_BOOK */
static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6399,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg, struct cpumask *mask)
{
int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+ cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+ group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
@@ -6660,6 +7025,9 @@ SD_INIT_FUNC(CPU)
#ifdef CONFIG_SCHED_MC
SD_INIT_FUNC(MC)
#endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
static int default_relax_domain_level = -1;
@@ -6709,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
free_cpumask_var(d->tmpmask); /* fall through */
case sa_send_covered:
free_cpumask_var(d->send_covered); /* fall through */
+ case sa_this_book_map:
+ free_cpumask_var(d->this_book_map); /* fall through */
case sa_this_core_map:
free_cpumask_var(d->this_core_map); /* fall through */
case sa_this_sibling_map:
@@ -6755,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
return sa_nodemask;
if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
return sa_this_sibling_map;
- if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
return sa_this_core_map;
+ if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ return sa_this_book_map;
if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
return sa_send_covered;
d->rd = alloc_rootdomain();
@@ -6814,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
return sd;
}
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+ sd = &per_cpu(book_domains, i).sd;
+ SD_INIT(sd, BOOK);
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+ sd->parent = parent;
+ parent->child = sd;
+ cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+ return sd;
+}
+
static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *parent, int i)
@@ -6871,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
d->send_covered, d->tmpmask);
break;
#endif
+#ifdef CONFIG_SCHED_BOOK
+ case SD_LV_BOOK: /* set up book groups */
+ cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+ if (cpu == cpumask_first(d->this_book_map))
+ init_sched_build_groups(d->this_book_map, cpu_map,
+ &cpu_to_book_group,
+ d->send_covered, d->tmpmask);
+ break;
+#endif
case SD_LV_CPU: /* set up physical groups */
cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
if (!cpumask_empty(d->nodemask))
@@ -6918,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+ sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
}
for_each_cpu(i, cpu_map) {
build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+ build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
build_sched_groups(&d, SD_LV_MC, cpu_map, i);
}
@@ -6954,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
init_sched_groups_power(i, sd);
}
#endif
+#ifdef CONFIG_SCHED_BOOK
+ for_each_cpu(i, cpu_map) {
+ sd = &per_cpu(book_domains, i).sd;
+ init_sched_groups_power(i, sd);
+ }
+#endif
for_each_cpu(i, cpu_map) {
sd = &per_cpu(phys_domains, i).sd;
@@ -6979,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
sd = &per_cpu(cpu_domains, i).sd;
#elif defined(CONFIG_SCHED_MC)
sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+ sd = &per_cpu(book_domains, i).sd;
#else
sd = &per_cpu(phys_domains, i).sd;
#endif
@@ -7288,29 +7696,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-#ifndef CONFIG_CPUSETS
/*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
*/
-static int update_sched_domains(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
{
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- partition_sched_domains(1, NULL, NULL);
+ cpuset_update_active_cpus();
return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ cpuset_update_active_cpus();
+ return NOTIFY_OK;
default:
return NOTIFY_DONE;
}
}
-#endif
static int update_runtime(struct notifier_block *nfb,
unsigned long action, void *hcpu)
@@ -7356,10 +7770,8 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
-#ifndef CONFIG_CPUSETS
- /* XXX: Theoretical race here - CPU may be hotplugged now */
- hotcpu_notifier(update_sched_domains, 0);
-#endif
+ hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+ hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
/* RT runtime code needs to handle some hotplug events */
hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +8016,9 @@ void __init sched_init(void)
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
+
+ rq->last_load_update_tick = jiffies;
+
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -7617,6 +8032,10 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+ rq->nohz_balance_kick = 0;
+ init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +8080,11 @@ void __init sched_init(void)
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
- alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
+ atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+ atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
#endif
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
@@ -7869,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
- err_free_rq:
+err_free_rq:
kfree(cfs_rq);
- err:
+err:
return 0;
}
@@ -7959,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
- err_free_rq:
+err_free_rq:
kfree(rt_rq);
- err:
+err:
return 0;
}
@@ -8088,12 +8510,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- set_task_rq(tsk, task_cpu(tsk));
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->moved_group)
- tsk->sched_class->moved_group(tsk, on_rq);
+ if (tsk->sched_class->task_move_group)
+ tsk->sched_class->task_move_group(tsk, on_rq);
+ else
#endif
+ set_task_rq(tsk, task_cpu(tsk));
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -8319,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);