summaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c214
1 files changed, 119 insertions, 95 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index fde6ff903525..ccacdbdecf45 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#include "sched_cpupri.h"
#include "workqueue_sched.h"
@@ -124,7 +127,7 @@
static inline int rt_policy(int policy)
{
- if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+ if (policy == SCHED_FIFO || policy == SCHED_RR)
return 1;
return 0;
}
@@ -422,6 +425,7 @@ struct rt_rq {
*/
struct root_domain {
atomic_t refcount;
+ atomic_t rto_count;
struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
@@ -431,7 +435,6 @@ struct root_domain {
* one runnable RT task.
*/
cpumask_var_t rto_mask;
- atomic_t rto_count;
struct cpupri cpupri;
};
@@ -528,6 +531,12 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif
/* calc_load related fields */
unsigned long calc_load_update;
@@ -581,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
- rcu_read_lock_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
@@ -1568,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return rq->avg_load_per_task;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
- unsigned long load;
- long cpu = (long)data;
-
- if (!tg->parent) {
- load = cpu_rq(cpu)->load.weight;
- } else {
- load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->se[cpu]->load.weight;
- load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
- }
-
- tg->cfs_rq[cpu]->h_load = load;
-
- return 0;
-}
-
-static void update_h_load(long cpu)
-{
- walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
#ifdef CONFIG_PREEMPT
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1953,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
}
EXPORT_SYMBOL_GPL(account_system_vtime);
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
{
- s64 irq_delta;
+ if (unlikely(steal > NSEC_PER_SEC))
+ return div_u64(steal, TICK_NSEC);
+
+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
/*
@@ -1979,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_branch((&paravirt_steal_rq_enabled))) {
+ u64 st;
+
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ st = steal_ticks(steal);
+ steal = st * TICK_NSEC;
+
+ rq->prev_steal_time_rq += steal;
+
+ delta -= steal;
+ }
+#endif
+
rq->clock_task += delta;
- if (irq_delta && sched_feat(NONIRQ_POWER))
- sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ sched_rt_avg_update(rq, irq_delta + steal);
+#endif
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int irqtime_account_hi_update(void)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2019,12 +2036,7 @@ static int irqtime_account_si_update(void)
#define sched_clock_irqtime (0)
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
- rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -2220,7 +2232,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_cpu(p) != new_cpu) {
p->se.nr_migrations++;
- perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+ perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
__set_task_cpu(p, new_cpu);
@@ -2497,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
- if (unlikely(rq->idle_stamp)) {
+ if (rq->idle_stamp) {
u64 delta = rq->clock - rq->idle_stamp;
u64 max = 2*sysctl_sched_migration_cost;
@@ -2886,7 +2898,7 @@ void sched_fork(struct task_struct *p)
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
@@ -3877,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
cpustat->idle = cputime64_add(cpustat->idle, cputime64);
}
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_branch(&paravirt_steal_enabled)) {
+ u64 steal, st = 0;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ st = steal_ticks(steal);
+ this_rq()->prev_steal_time += st * TICK_NSEC;
+
+ account_steal_time(st);
+ return st;
+ }
+#endif
+ return false;
+}
+
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3908,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ if (steal_account_process_tick())
+ return;
+
if (irqtime_account_hi_update()) {
cpustat->irq = cputime64_add(cpustat->irq, tmp);
} else if (irqtime_account_si_update()) {
@@ -3961,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}
+ if (steal_account_process_tick())
+ return;
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4338,11 +4375,8 @@ EXPORT_SYMBOL(schedule);
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
- bool ret = false;
-
- rcu_read_lock();
if (lock->owner != owner)
- goto fail;
+ return false;
/*
* Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4352,11 +4386,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
*/
barrier();
- ret = owner->on_cpu;
-fail:
- rcu_read_unlock();
-
- return ret;
+ return owner->on_cpu;
}
/*
@@ -4368,21 +4398,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
if (!sched_feat(OWNER_SPIN))
return 0;
+ rcu_read_lock();
while (owner_running(lock, owner)) {
if (need_resched())
- return 0;
+ break;
arch_mutex_cpu_relax();
}
+ rcu_read_unlock();
/*
- * If the owner changed to another task there is likely
- * heavy contention, stop spinning.
+ * We break out the loop above on need_resched() and when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
*/
- if (lock->owner)
- return 0;
-
- return 1;
+ return lock->owner == NULL;
}
#endif
@@ -7898,17 +7928,10 @@ int in_sched_functions(unsigned long addr)
&& addr < (unsigned long)__sched_text_end);
}
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT;
INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq->rq = rq;
- /* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
- cfs_rq->load_stamp = 1;
-#endif
-#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -7928,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
- plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
+ plist_head_init(&rt_rq->pushable_tasks);
#endif
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
rt_rq->rt_runtime = 0;
raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
- rt_rq->rt_nr_boosted = 0;
- rt_rq->rq = rq;
-#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7957,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
- tg->cfs_rq[cpu] = cfs_rq;
- init_cfs_rq(cfs_rq, rq);
+
cfs_rq->tg = tg;
+ cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+ /* allow initial update_cfs_load() to truncate */
+ cfs_rq->load_stamp = 1;
+#endif
+ tg->cfs_rq[cpu] = cfs_rq;
tg->se[cpu] = se;
+
/* se could be NULL for root_task_group */
if (!se)
return;
@@ -7984,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
{
struct rq *rq = cpu_rq(cpu);
- tg->rt_rq[cpu] = rt_rq;
- init_rt_rq(rt_rq, rq);
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->rt_nr_boosted = 0;
+ rt_rq->rq = rq;
rt_rq->tg = tg;
- rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
+ tg->rt_rq[cpu] = rt_rq;
tg->rt_se[cpu] = rt_se;
+
if (!rt_se)
return;
@@ -8071,7 +8093,7 @@ void __init sched_init(void)
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
- init_cfs_rq(&rq->cfs, rq);
+ init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = root_task_group_load;
@@ -8142,7 +8164,7 @@ void __init sched_init(void)
#endif
#ifdef CONFIG_RT_MUTEXES
- plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
+ plist_head_init(&init_task.pi_waiters);
#endif
/*
@@ -8185,7 +8207,7 @@ void __init sched_init(void)
scheduler_running = 1;
}
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8195,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset)
void __might_sleep(const char *file, int line, int preempt_offset)
{
-#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8217,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
if (irqs_disabled())
print_irqtrace_events(current);
dump_stack();
-#endif
}
EXPORT_SYMBOL(__might_sleep);
#endif
@@ -8376,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!se)
goto err_free_rq;
+ init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
}
@@ -8403,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
{
}
@@ -8424,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg)
{
int i;
- destroy_rt_bandwidth(&tg->rt_bandwidth);
+ if (tg->rt_se)
+ destroy_rt_bandwidth(&tg->rt_bandwidth);
for_each_possible_cpu(i) {
if (tg->rt_rq)
@@ -8465,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
+ init_rt_rq(rt_rq, cpu_rq(i));
+ rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}