diff options
Diffstat (limited to 'kernel')
63 files changed, 3408 insertions, 1130 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index ebdb0043203a..84d882f3e299 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -225,7 +225,7 @@ config ARCH_SUPPORTS_ATOMIC_RMW config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config RWSEM_SPIN_ON_OWNER def_bool y diff --git a/kernel/audit.c b/kernel/audit.c index f1ca11613379..67b9fbd871be 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; -static int audit_net_id; +static unsigned int audit_net_id; /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -1172,9 +1172,8 @@ static void __net_exit audit_net_exit(struct net *net) audit_sock = NULL; } - RCU_INIT_POINTER(aunet->nlsk, NULL); - synchronize_net(); netlink_kernel_release(sock); + aunet->nlsk = NULL; } static struct pernet_operations audit_net_ops __net_initdata = { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index eed911d091da..1276474ac3cd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,8 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +obj-$(CONFIG_CGROUP_BPF) += cgroup.o diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c new file mode 100644 index 000000000000..89b7ef41c86b --- /dev/null +++ b/kernel/bpf/bpf_lru_list.c @@ -0,0 +1,695 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include "bpf_lru_list.h" + +#define LOCAL_FREE_TARGET (128) +#define LOCAL_NR_SCANS LOCAL_FREE_TARGET + +#define PERCPU_FREE_TARGET (16) +#define PERCPU_NR_SCANS PERCPU_FREE_TARGET + +/* Helpers to get the local list index */ +#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) +#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) +#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) +#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) + +static int get_next_cpu(int cpu) +{ + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_possible_mask); + return cpu; +} + +/* Local list helpers */ +static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_FREE_LIST_IDX]; +} + +static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; +} + +/* bpf_lru_node helpers */ +static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) +{ + return node->ref; +} + +static void bpf_lru_list_count_inc(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]++; +} + +static void bpf_lru_list_count_dec(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]--; +} + +static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, + struct bpf_lru_node *node, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + /* If the removing node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also. + */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + bpf_lru_list_count_dec(l, node->type); + + node->type = tgt_free_type; + list_move(&node->list, free_list); +} + +/* Move nodes from local list to the LRU list */ +static void __bpf_lru_node_move_in(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + node->ref = 0; + list_move(&node->list, &l->lists[tgt_type]); +} + +/* Move nodes between or within active and inactive list (like + * active to inactive, inactive to active or tail of active back to + * the head of active). + */ +static void __bpf_lru_node_move(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + if (node->type != tgt_type) { + bpf_lru_list_count_dec(l, node->type); + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + } + node->ref = 0; + + /* If the moving node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also. + */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + list_move(&node->list, &l->lists[tgt_type]); +} + +static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) +{ + return l->counts[BPF_LRU_LIST_T_INACTIVE] < + l->counts[BPF_LRU_LIST_T_ACTIVE]; +} + +/* Rotate the active list: + * 1. Start from tail + * 2. If the node has the ref bit set, it will be rotated + * back to the head of active list with the ref bit cleared. + * Give this node one more chance to survive in the active list. + * 3. If the ref bit is not set, move it to the head of the + * inactive list. + * 4. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int i = 0; + + first_node = list_first_entry(active, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, active, list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + + if (++i == lru->nr_scans || node == first_node) + break; + } +} + +/* Rotate the inactive list. It starts from the next_inactive_rotation + * 1. If the node has ref bit set, it will be moved to the head + * of active list with the ref bit cleared. + * 2. If the node does not have ref bit set, it will leave it + * at its current location (i.e. do nothing) so that it can + * be considered during the next inactive_shrink. + * 3. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct list_head *cur, *last, *next = inactive; + struct bpf_lru_node *node; + unsigned int i = 0; + + if (list_empty(inactive)) + return; + + last = l->next_inactive_rotation->next; + if (last == inactive) + last = last->next; + + cur = l->next_inactive_rotation; + while (i < lru->nr_scans) { + if (cur == inactive) { + cur = cur->prev; + continue; + } + + node = list_entry(cur, struct bpf_lru_node, list); + next = cur->prev; + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + if (cur == last) + break; + cur = next; + i++; + } + + l->next_inactive_rotation = next; +} + +/* Shrink the inactive list. It starts from the tail of the + * inactive list and only move the nodes without the ref bit + * set to the designated free list. + */ +static unsigned int +__bpf_lru_list_shrink_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int nshrinked = 0; + unsigned int i = 0; + + first_node = list_first_entry(inactive, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { + if (bpf_lru_node_is_ref(node)) { + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + } else if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + if (++nshrinked == tgt_nshrink) + break; + } + + if (++i == lru->nr_scans) + break; + } + + return nshrinked; +} + +/* 1. Rotate the active list (if needed) + * 2. Always rotate the inactive list + */ +static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) +{ + if (bpf_lru_list_inactive_low(l)) + __bpf_lru_list_rotate_active(lru, l); + + __bpf_lru_list_rotate_inactive(lru, l); +} + +/* Calls __bpf_lru_list_shrink_inactive() to shrink some + * ref-bit-cleared nodes and move them to the designated + * free list. + * + * If it cannot get a free node after calling + * __bpf_lru_list_shrink_inactive(). It will just remove + * one node from either inactive or active list without + * honoring the ref-bit. It prefers inactive list to active + * list in this situation. + */ +static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) + +{ + struct bpf_lru_node *node, *tmp_node; + struct list_head *force_shrink_list; + unsigned int nshrinked; + + nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, + free_list, tgt_free_type); + if (nshrinked) + return nshrinked; + + /* Do a force shrink by ignoring the reference bit */ + if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) + force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + else + force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + + list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, + list) { + if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + return 1; + } + } + + return 0; +} + +/* Flush the nodes from the local pending list to the LRU list */ +static void __local_list_flush(struct bpf_lru_list *l, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node, *tmp_node; + + list_for_each_entry_safe_reverse(node, tmp_node, + local_pending_list(loc_l), list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move_in(l, node, + BPF_LRU_LIST_T_INACTIVE); + } +} + +static void bpf_lru_list_push_free(struct bpf_lru_list *l, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + raw_spin_lock_irqsave(&l->lock, flags); + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + struct bpf_lru_node *node, *tmp_node; + unsigned int nfree = 0; + + raw_spin_lock(&l->lock); + + __local_list_flush(l, loc_l); + + __bpf_lru_list_rotate(lru, l); + + list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], + list) { + __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + if (++nfree == LOCAL_FREE_TARGET) + break; + } + + if (nfree < LOCAL_FREE_TARGET) + __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + + raw_spin_unlock(&l->lock); +} + +static void __local_list_add_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l, + int cpu, + struct bpf_lru_node *node, + u32 hash) +{ + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->cpu = cpu; + node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + node->ref = 0; + list_add(&node->list, local_pending_list(loc_l)); +} + +struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + + node = list_first_entry_or_null(local_free_list(loc_l), + struct bpf_lru_node, + list); + if (node) + list_del(&node->list); + + return node; +} + +struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + bool force = false; + +ignore_ref: + /* Get from the tail (i.e. older element) of the pending list. */ + list_for_each_entry_reverse(node, local_pending_list(loc_l), + list) { + if ((!bpf_lru_node_is_ref(node) || force) && + lru->del_from_htab(lru->del_arg, node)) { + list_del(&node->list); + return node; + } + } + + if (!force) { + force = true; + goto ignore_ref; + } + + return NULL; +} + +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct list_head *free_list; + struct bpf_lru_node *node = NULL; + struct bpf_lru_list *l; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + l = per_cpu_ptr(lru->percpu_lru, cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_list_rotate(lru, l); + + free_list = &l->lists[BPF_LRU_LIST_T_FREE]; + if (list_empty(free_list)) + __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, + BPF_LRU_LIST_T_FREE); + + if (!list_empty(free_list)) { + node = list_first_entry(free_list, struct bpf_lru_node, list); + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->ref = 0; + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + } + + raw_spin_unlock_irqrestore(&l->lock, flags); + + return node; +} + +static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct bpf_lru_locallist *loc_l, *steal_loc_l; + struct bpf_common_lru *clru = &lru->common_lru; + struct bpf_lru_node *node; + int steal, first_steal; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + loc_l = per_cpu_ptr(clru->local_list, cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + node = __local_list_pop_free(loc_l); + if (!node) { + bpf_lru_list_pop_free_to_local(lru, loc_l); + node = __local_list_pop_free(loc_l); + } + + if (node) + __local_list_add_pending(lru, loc_l, cpu, node, hash); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + + if (node) + return node; + + /* No free nodes found from the local free list and + * the global LRU list. + * + * Steal from the local free/pending list of the + * current CPU and remote CPU in RR. It starts + * with the loc_l->next_steal CPU. + */ + + first_steal = loc_l->next_steal; + steal = first_steal; + do { + steal_loc_l = per_cpu_ptr(clru->local_list, steal); + + raw_spin_lock_irqsave(&steal_loc_l->lock, flags); + + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + + raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + + steal = get_next_cpu(steal); + } while (!node && steal != first_steal); + + loc_l->next_steal = steal; + + if (node) { + raw_spin_lock_irqsave(&loc_l->lock, flags); + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + } + + return node; +} + +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +{ + if (lru->percpu) + return bpf_percpu_lru_pop_free(lru, hash); + else + return bpf_common_lru_pop_free(lru, hash); +} + +static void bpf_common_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + return; + + if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + goto check_lru_list; + } + + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + node->ref = 0; + list_move(&node->list, local_free_list(loc_l)); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + return; + } + +check_lru_list: + bpf_lru_list_push_free(&lru->common_lru.lru_list, node); +} + +static void bpf_percpu_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + struct bpf_lru_list *l; + unsigned long flags; + + l = per_cpu_ptr(lru->percpu_lru, node->cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +{ + if (lru->percpu) + bpf_percpu_lru_push_free(lru, node); + else + bpf_common_lru_push_free(lru, node); +} + +void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + u32 i; + + for (i = 0; i < nr_elems; i++) { + struct bpf_lru_node *node; + + node = (struct bpf_lru_node *)(buf + node_offset); + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + buf += elem_size; + } +} + +void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + u32 i, pcpu_entries; + int cpu; + struct bpf_lru_list *l; + + pcpu_entries = nr_elems / num_possible_cpus(); + + i = 0; + + for_each_possible_cpu(cpu) { + struct bpf_lru_node *node; + + l = per_cpu_ptr(lru->percpu_lru, cpu); +again: + node = (struct bpf_lru_node *)(buf + node_offset); + node->cpu = cpu; + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } +} + +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + if (lru->percpu) + bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); + else + bpf_common_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); +} + +static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) + INIT_LIST_HEAD(&loc_l->lists[i]); + + loc_l->next_steal = cpu; + + raw_spin_lock_init(&loc_l->lock); +} + +static void bpf_lru_list_init(struct bpf_lru_list *l) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LIST_T; i++) + INIT_LIST_HEAD(&l->lists[i]); + + for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) + l->counts[i] = 0; + + l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + + raw_spin_lock_init(&l->lock); +} + +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, + del_from_htab_func del_from_htab, void *del_arg) +{ + int cpu; + + if (percpu) { + lru->percpu_lru = alloc_percpu(struct bpf_lru_list); + if (!lru->percpu_lru) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_list *l; + + l = per_cpu_ptr(lru->percpu_lru, cpu); + bpf_lru_list_init(l); + } + lru->nr_scans = PERCPU_NR_SCANS; + } else { + struct bpf_common_lru *clru = &lru->common_lru; + + clru->local_list = alloc_percpu(struct bpf_lru_locallist); + if (!clru->local_list) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + bpf_lru_locallist_init(loc_l, cpu); + } + + bpf_lru_list_init(&clru->lru_list); + lru->nr_scans = LOCAL_NR_SCANS; + } + + lru->percpu = percpu; + lru->del_from_htab = del_from_htab; + lru->del_arg = del_arg; + lru->hash_offset = hash_offset; + + return 0; +} + +void bpf_lru_destroy(struct bpf_lru *lru) +{ + if (lru->percpu) + free_percpu(lru->percpu_lru); + else + free_percpu(lru->common_lru.local_list); +} diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h new file mode 100644 index 000000000000..5c35a98d02bf --- /dev/null +++ b/kernel/bpf/bpf_lru_list.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __BPF_LRU_LIST_H_ +#define __BPF_LRU_LIST_H_ + +#include <linux/list.h> +#include <linux/spinlock_types.h> + +#define NR_BPF_LRU_LIST_T (3) +#define NR_BPF_LRU_LIST_COUNT (2) +#define NR_BPF_LRU_LOCAL_LIST_T (2) +#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T + +enum bpf_lru_list_type { + BPF_LRU_LIST_T_ACTIVE, + BPF_LRU_LIST_T_INACTIVE, + BPF_LRU_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_PENDING, +}; + +struct bpf_lru_node { + struct list_head list; + u16 cpu; + u8 type; + u8 ref; +}; + +struct bpf_lru_list { + struct list_head lists[NR_BPF_LRU_LIST_T]; + unsigned int counts[NR_BPF_LRU_LIST_COUNT]; + /* The next inacitve list rotation starts from here */ + struct list_head *next_inactive_rotation; + + raw_spinlock_t lock ____cacheline_aligned_in_smp; +}; + +struct bpf_lru_locallist { + struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + u16 next_steal; + raw_spinlock_t lock; +}; + +struct bpf_common_lru { + struct bpf_lru_list lru_list; + struct bpf_lru_locallist __percpu *local_list; +}; + +typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); + +struct bpf_lru { + union { + struct bpf_common_lru common_lru; + struct bpf_lru_list __percpu *percpu_lru; + }; + del_from_htab_func del_from_htab; + void *del_arg; + unsigned int hash_offset; + unsigned int nr_scans; + bool percpu; +}; + +static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) +{ + /* ref is an approximation on access frequency. It does not + * have to be very accurate. Hence, no protection is used. + */ + node->ref = 1; +} + +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, + del_from_htab_func del_from_htab, void *delete_arg); +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems); +void bpf_lru_destroy(struct bpf_lru *lru); +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); +void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); + +#endif diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c new file mode 100644 index 000000000000..a515f7b007c6 --- /dev/null +++ b/kernel/bpf/cgroup.c @@ -0,0 +1,200 @@ +/* + * Functions to manage eBPF programs attached to cgroups + * + * Copyright (c) 2016 Daniel Mack + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/bpf-cgroup.h> +#include <net/sock.h> + +DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +EXPORT_SYMBOL(cgroup_bpf_enabled_key); + +/** + * cgroup_bpf_put() - put references of all bpf programs + * @cgrp: the cgroup to modify + */ +void cgroup_bpf_put(struct cgroup *cgrp) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { + struct bpf_prog *prog = cgrp->bpf.prog[type]; + + if (prog) { + bpf_prog_put(prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } + } +} + +/** + * cgroup_bpf_inherit() - inherit effective programs from parent + * @cgrp: the cgroup to modify + * @parent: the parent to inherit from + */ +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { + struct bpf_prog *e; + + e = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + rcu_assign_pointer(cgrp->bpf.effective[type], e); + } +} + +/** + * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @parent: The parent of @cgrp, or %NULL if @cgrp is the root + * @prog: A new program to pin + * @type: Type of pinning operation (ingress/egress) + * + * Each cgroup has a set of two pointers for bpf programs; one for eBPF + * programs it owns, and which is effective for execution. + * + * If @prog is not %NULL, this function attaches a new program to the cgroup + * and releases the one that is currently attached, if any. @prog is then made + * the effective program of type @type in that cgroup. + * + * If @prog is %NULL, the currently attached program of type @type is released, + * and the effective program of the parent cgroup (if any) is inherited to + * @cgrp. + * + * Then, the descendants of @cgrp are walked and the effective program for + * each of them is set to the effective program of @cgrp unless the + * descendant has its own program attached, in which case the subbranch is + * skipped. This ensures that delegated subcgroups with own programs are left + * untouched. + * + * Must be called with cgroup_mutex held. + */ +void __cgroup_bpf_update(struct cgroup *cgrp, + struct cgroup *parent, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct bpf_prog *old_prog, *effective; + struct cgroup_subsys_state *pos; + + old_prog = xchg(cgrp->bpf.prog + type, prog); + + effective = (!prog && parent) ? + rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)) : + prog; + + css_for_each_descendant_pre(pos, &cgrp->self) { + struct cgroup *desc = container_of(pos, struct cgroup, self); + + /* skip the subtree if the descendant has its own program */ + if (desc->bpf.prog[type] && desc != cgrp) + pos = css_rightmost_descendant(pos); + else + rcu_assign_pointer(desc->bpf.effective[type], + effective); + } + + if (prog) + static_branch_inc(&cgroup_bpf_enabled_key); + + if (old_prog) { + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } +} + +/** + * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering + * @sk: The socken sending or receiving traffic + * @skb: The skb that is being sent or received + * @type: The type of program to be exectuted + * + * If no socket is passed, or the socket is not of type INET or INET6, + * this function does nothing and returns 0. + * + * The program type passed in via @type must be suitable for network + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_skb(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret = 0; + + if (!sk || !sk_fullsock(sk)) + return 0; + + if (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) { + unsigned int offset = skb->data - skb_network_header(skb); + + __skb_push(skb, offset); + ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; + __skb_pull(skb, offset); + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); + +/** + * __cgroup_bpf_run_filter_sk() - Run a program on a sock + * @sk: sock structure to manipulate + * @type: The type of program to be exectuted + * + * socket is passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa6d98154106..83e0d153b0b4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,71 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } +#define SHA_BPF_RAW_SIZE \ + round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) + +/* Called under verifier mutex. */ +void bpf_prog_calc_digest(struct bpf_prog *fp) +{ + const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + static u32 ws[SHA_WORKSPACE_WORDS]; + static u8 raw[SHA_BPF_RAW_SIZE]; + struct bpf_insn *dst = (void *)raw; + u32 i, bsize, psize, blocks; + bool was_ld_map; + u8 *todo = raw; + __be32 *result; + __be64 *bits; + + sha_init(fp->digest); + memset(ws, 0, sizeof(ws)); + + /* We need to take out the map fd for the digest calculation + * since they are unstable from user space side. + */ + for (i = 0, was_ld_map = false; i < fp->len; i++) { + dst[i] = fp->insnsi[i]; + if (!was_ld_map && + dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + was_ld_map = true; + dst[i].imm = 0; + } else if (was_ld_map && + dst[i].code == 0 && + dst[i].dst_reg == 0 && + dst[i].src_reg == 0 && + dst[i].off == 0) { + was_ld_map = false; + dst[i].imm = 0; + } else { + was_ld_map = false; + } + } + + psize = fp->len * sizeof(struct bpf_insn); + memset(&raw[psize], 0, sizeof(raw) - psize); + raw[psize++] = 0x80; + + bsize = round_up(psize, SHA_MESSAGE_BYTES); + blocks = bsize / SHA_MESSAGE_BYTES; + if (bsize - psize >= sizeof(__be64)) { + bits = (__be64 *)(todo + bsize - sizeof(__be64)); + } else { + bits = (__be64 *)(todo + bsize + bits_offset); + blocks++; + } + *bits = cpu_to_be64((psize - 1) << 3); + + while (blocks--) { + sha_transform(fp->digest, todo, ws); + todo += SHA_MESSAGE_BYTES; + } + + result = (__force __be32 *)fp->digest; + for (i = 0; i < SHA_DIGEST_WORDS; i++) + result[i] = cpu_to_be32(fp->digest[i]); +} + static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && @@ -1043,6 +1108,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; @@ -1077,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) return prog; } -bool __weak bpf_helper_changes_skb_data(void *func) +bool __weak bpf_helper_changes_pkt_data(void *func) { return false; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 570eeca7bdfa..34debc1a9641 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -15,6 +15,7 @@ #include <linux/filter.h> #include <linux/vmalloc.h> #include "percpu_freelist.h" +#include "bpf_lru_list.h" struct bucket { struct hlist_head head; @@ -25,7 +26,10 @@ struct bpf_htab { struct bpf_map map; struct bucket *buckets; void *elems; - struct pcpu_freelist freelist; + union { + struct pcpu_freelist freelist; + struct bpf_lru lru; + }; void __percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ @@ -48,11 +52,26 @@ struct htab_elem { union { struct rcu_head rcu; enum extra_elem_state state; + struct bpf_lru_node lru_node; }; u32 hash; char key[0] __aligned(8); }; +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); + +static bool htab_is_lru(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + +static bool htab_is_percpu(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab) { int i; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto free_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -87,7 +106,22 @@ free_elems: vfree(htab->elems); } -static int prealloc_elems_and_freelist(struct bpf_htab *htab) +static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, + u32 hash) +{ + struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); + struct htab_elem *l; + + if (node) { + l = container_of(node, struct htab_elem, lru_node); + memcpy(l->key, key, htab->map.key_size); + return l; + } + + return NULL; +} + +static int prealloc_init(struct bpf_htab *htab) { int err = -ENOMEM, i; @@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) if (!htab->elems) return -ENOMEM; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto skip_percpu_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) } skip_percpu_elems: - err = pcpu_freelist_init(&htab->freelist); + if (htab_is_lru(htab)) + err = bpf_lru_init(&htab->lru, + htab->map.map_flags & BPF_F_NO_COMMON_LRU, + offsetof(struct htab_elem, hash) - + offsetof(struct htab_elem, lru_node), + htab_lru_map_delete_node, + htab); + else + err = pcpu_freelist_init(&htab->freelist); + if (err) goto free_elems; - pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, - htab->map.max_entries); + if (htab_is_lru(htab)) + bpf_lru_populate(&htab->lru, htab->elems, + offsetof(struct htab_elem, lru_node), + htab->elem_size, htab->map.max_entries); + else + pcpu_freelist_populate(&htab->freelist, htab->elems, + htab->elem_size, htab->map.max_entries); + return 0; free_elems: @@ -123,6 +172,16 @@ free_elems: return err; } +static void prealloc_destroy(struct bpf_htab *htab) +{ + htab_free_elems(htab); + + if (htab_is_lru(htab)) + bpf_lru_destroy(&htab->lru); + else + pcpu_freelist_destroy(&htab->freelist); +} + static int alloc_extra_elems(struct bpf_htab *htab) { void __percpu *pptr; @@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab) /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { - bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; u64 cost; - if (attr->map_flags & ~BPF_F_NO_PREALLOC) + if (lru && !capable(CAP_SYS_ADMIN)) + /* LRU implementation is much complicated than other + * maps. Hence, limit to CAP_SYS_ADMIN for now. + */ + return ERR_PTR(-EPERM); + + if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); + if (!lru && percpu_lru) + return ERR_PTR(-EINVAL); + + if (lru && !prealloc) + return ERR_PTR(-ENOTSUPP); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size == 0) goto free_htab; + if (percpu_lru) { + /* ensure each CPU's lru list has >=1 elements. + * since we are at it, make each lru list has the same + * number of elements. + */ + htab->map.max_entries = roundup(attr->max_entries, + num_possible_cpus()); + if (htab->map.max_entries < attr->max_entries) + htab->map.max_entries = rounddown(attr->max_entries, + num_possible_cpus()); + } + /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); @@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu) { + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ err = alloc_extra_elems(htab); if (err) goto free_buckets; } - if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { - err = prealloc_elems_and_freelist(htab); + if (prealloc) { + err = prealloc_init(htab); if (err) goto free_extra_elems; } @@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return l->key + round_up(map->key_size, 8); + } + + return NULL; +} + +/* It is called from the bpf_lru_list when the LRU needs to delete + * older elements from the htab. + */ +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) +{ + struct bpf_htab *htab = (struct bpf_htab *)arg; + struct htab_elem *l, *tgt_l; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + + tgt_l = container_of(node, struct htab_elem, lru_node); + b = __select_bucket(htab, tgt_l->hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l == tgt_l) { + hlist_del_rcu(&l->hash_node); + break; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + + return l == tgt_l; +} + /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } } +static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + u32 size = round_up(htab->map.value_size, 8); + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, @@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - if (!onallcpus) { - /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); - } else { - int off = 0, cpu; + pcpu_copy_value(htab, pptr, value, onallcpus); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); } else { @@ -571,6 +715,70 @@ err: return ret; } +static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old = NULL; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because getting free nodes from LRU may need + * to remove older elements from htab and this removal + * operation will need a bucket lock. + */ + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + bpf_lru_node_set_ref(&l_new->lru_node); + hlist_del_rcu(&l_old->hash_node); + } + ret = 0; + +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + + if (ret) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + else if (l_old) + bpf_lru_push_free(&htab->lru, &l_old->lru_node); + + return ret; +} + static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) @@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); - u32 size = htab->map.value_size; - /* per-cpu hash map can update value in-place */ - if (!onallcpus) { - memcpy(this_cpu_ptr(pptr), value, size); - } else { - int off = 0, cpu; - - size = round_up(size, 8); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus, false); @@ -637,12 +832,84 @@ err: return ret; } +static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because LRU's elem alloc may need + * to remove older elem from htab and this removal + * operation will need a bucket lock. + */ + if (map_flags != BPF_EXIST) { + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + } + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + bpf_lru_node_set_ref(&l_old->lru_node); + + /* per-cpu hash map can update value in-place */ + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + value, onallcpus); + hlist_add_head_rcu(&l_new->hash_node, head); + l_new = NULL; + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l_new) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + return ret; +} + static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } +static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, + false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } +static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + ret = 0; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l) + bpf_lru_push_free(&htab->lru, &l->lru_node); + return ret; +} + static void delete_all_elements(struct bpf_htab *htab) { int i; @@ -687,7 +987,8 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab_elem_free(htab, l); + if (l->state != HTAB_EXTRA_ELEM_USED) + htab_elem_free(htab, l); } } } @@ -707,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + if (htab->map.map_flags & BPF_F_NO_PREALLOC) delete_all_elements(htab); - } else { - htab_free_elems(htab); - pcpu_freelist_destroy(&htab->freelist); - } + else + prealloc_destroy(htab); + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); @@ -732,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +static const struct bpf_map_ops htab_lru_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_map_lookup_elem, + .map_update_elem = htab_lru_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_type __read_mostly = { + .ops = &htab_lru_ops, + .type = BPF_MAP_TYPE_LRU_HASH, +}; + /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -743,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -760,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + if (htab_is_lru(htab)) + bpf_lru_node_set_ref(&l->lru_node); pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, @@ -775,10 +1104,16 @@ out: int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int ret; rcu_read_lock(); - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + if (htab_is_lru(htab)) + ret = __htab_lru_percpu_map_update_elem(map, key, value, + map_flags, true); + else + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, + true); rcu_read_unlock(); return ret; @@ -798,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = { .type = BPF_MAP_TYPE_PERCPU_HASH, }; +static const struct bpf_map_ops htab_lru_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_update_elem = htab_lru_percpu_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { + .ops = &htab_lru_percpu_ops, + .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); + bpf_register_map_type(&htab_lru_type); + bpf_register_map_type(&htab_lru_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 39918402e6e9..045cbe673356 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -13,6 +13,7 @@ #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> +#include <linux/topology.h> #include <linux/ktime.h> #include <linux/sched.h> #include <linux/uidgid.h> @@ -92,6 +93,17 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_numa_node_id) +{ + return numa_node_id(); +} + +const struct bpf_func_proto bpf_get_numa_node_id_proto = { + .func = bpf_get_numa_node_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1ed8473ec537..0b030c9126d3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -18,6 +18,7 @@ #include <linux/namei.h> #include <linux/fs.h> #include <linux/kdev_t.h> +#include <linux/parser.h> #include <linux/filter.h> #include <linux/bpf.h> @@ -87,6 +88,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, switch (mode & S_IFMT) { case S_IFDIR: case S_IFREG: + case S_IFLNK: break; default: return ERR_PTR(-EINVAL); @@ -119,6 +121,16 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) return 0; } +static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + d_instantiate(dentry, inode); + dget(dentry); + + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; +} + static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -133,9 +145,7 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(inode); inc_nlink(dir); - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -151,9 +161,7 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, inode->i_op = iops; inode->i_private = dentry->d_fsdata; - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -181,13 +189,37 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); + return simple_lookup(dir, dentry, flags); } +static int bpf_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); + struct inode *inode; + + if (!link) + return -ENOMEM; + + inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); + if (IS_ERR(inode)) { + kfree(link); + return PTR_ERR(inode); + } + + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = link; + + bpf_dentry_finalize(dentry, inode, dir); + return 0; +} + static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, + .symlink = bpf_symlink, .rmdir = simple_rmdir, .rename = simple_rename, .link = simple_link, @@ -324,6 +356,8 @@ static void bpf_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); } @@ -331,15 +365,66 @@ static void bpf_evict_inode(struct inode *inode) static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, + .show_options = generic_show_options, .evict_inode = bpf_evict_inode, }; +enum { + OPT_MODE, + OPT_ERR, +}; + +static const match_table_t bpf_mount_tokens = { + { OPT_MODE, "mode=%o" }, + { OPT_ERR, NULL }, +}; + +struct bpf_mount_opts { + umode_t mode; +}; + +static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option, token; + char *ptr; + + opts->mode = S_IRWXUGO; + + while ((ptr = strsep(&data, ",")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, bpf_mount_tokens, args); + switch (token) { + case OPT_MODE: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* We might like to report bad mount options here, but + * traditionally we've ignored all mount options, so we'd + * better continue to ignore non-existing options for bpf. + */ + } + } + + return 0; +} + static int bpf_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr bpf_rfiles[] = { { "" } }; + struct bpf_mount_opts opts; struct inode *inode; int ret; + save_mount_options(sb, data); + + ret = bpf_parse_options(data, &opts); + if (ret) + return ret; + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -349,7 +434,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | S_IRWXUGO; + inode->i_mode |= S_ISVTX | opts.mode; return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962447a5..4819ec9d95f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -17,6 +17,7 @@ #include <linux/license.h> #include <linux/filter.h> #include <linux/version.h> +#include <linux/kernel.h> DEFINE_PER_CPU(int, bpf_prog_active); @@ -137,18 +138,31 @@ static int bpf_map_release(struct inode *inode, struct file *filp) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; + const struct bpf_array *array; + u32 owner_prog_type = 0; + + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + array = container_of(map, struct bpf_array, map); + owner_prog_type = array->owner_prog_type; + } seq_printf(m, "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" "max_entries:\t%u\n" - "map_flags:\t%#x\n", + "map_flags:\t%#x\n" + "memlock:\t%llu\n", map->map_type, map->key_size, map->value_size, map->max_entries, - map->map_flags); + map->map_flags, + map->pages * 1ULL << PAGE_SHIFT); + + if (owner_prog_type) + seq_printf(m, "owner_prog_type:\t%u\n", + owner_prog_type); } #endif @@ -194,7 +208,7 @@ static int map_create(union bpf_attr *attr) err = bpf_map_charge_memlock(map); if (err) - goto free_map; + goto free_map_nouncharge; err = bpf_map_new_fd(map); if (err < 0) @@ -204,6 +218,8 @@ static int map_create(union bpf_attr *attr) return err; free_map: + bpf_map_uncharge_memlock(map); +free_map_nouncharge: map->ops->map_free(map); return err; } @@ -252,12 +268,6 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } -/* helper to convert user pointers passed inside __aligned_u64 fields */ -static void __user *u64_to_ptr(__u64 val) -{ - return (void __user *) (unsigned long) val; -} - int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -268,8 +278,8 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) static int map_lookup_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; @@ -295,6 +305,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -305,7 +316,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -342,8 +354,8 @@ err_put: static int map_update_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; @@ -369,6 +381,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -388,7 +401,8 @@ static int map_update_elem(union bpf_attr *attr) */ preempt_disable(); __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); @@ -420,7 +434,7 @@ err_put: static int map_delete_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); + void __user *ukey = u64_to_user_ptr(attr->key); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; @@ -464,8 +478,8 @@ err_put: static int map_get_next_key(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *unext_key = u64_to_ptr(attr->next_key); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *unext_key = u64_to_user_ptr(attr->next_key); int ufd = attr->map_fd; struct bpf_map *map; void *key, *next_key; @@ -565,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_xdp_adjust_head) + prog->xdp_adjust_head = 1; if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in @@ -648,8 +664,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_prog *prog = filp->private_data; + char prog_digest[sizeof(prog->digest) * 2 + 1] = { }; + + bin2hex(prog_digest, prog->digest, sizeof(prog->digest)); + seq_printf(m, + "prog_type:\t%u\n" + "prog_jited:\t%u\n" + "prog_digest:\t%s\n" + "memlock:\t%llu\n", + prog->type, + prog->jited, + prog_digest, + prog->pages * 1ULL << PAGE_SHIFT); +} +#endif + static const struct file_operations bpf_prog_fops = { - .release = bpf_prog_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_prog_show_fdinfo, +#endif + .release = bpf_prog_release, }; int bpf_prog_new_fd(struct bpf_prog *prog) @@ -680,10 +718,22 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) } EXPORT_SYMBOL_GPL(bpf_prog_add); +void bpf_prog_sub(struct bpf_prog *prog, int i) +{ + /* Only to be used for undoing previous bpf_prog_add() in some + * error path. We still know that another entity in our call + * path holds a reference to the program, thus atomic_sub() can + * be safely used in such cases! + */ + WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); +} +EXPORT_SYMBOL_GPL(bpf_prog_sub); + struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) { return bpf_prog_add(prog, 1); } +EXPORT_SYMBOL_GPL(bpf_prog_inc); static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { @@ -730,7 +780,7 @@ static int bpf_prog_load(union bpf_attr *attr) return -EINVAL; /* copy eBPF program license from user space */ - if (strncpy_from_user(license, u64_to_ptr(attr->license), + if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; @@ -738,8 +788,8 @@ static int bpf_prog_load(union bpf_attr *attr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt >= BPF_MAXINSNS) - return -EINVAL; + if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + return -E2BIG; if (type == BPF_PROG_TYPE_KPROBE && attr->kern_version != LINUX_VERSION_CODE) @@ -760,7 +810,7 @@ static int bpf_prog_load(union bpf_attr *attr) prog->len = attr->insn_cnt; err = -EFAULT; - if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), prog->len * sizeof(struct bpf_insn)) != 0) goto free_prog; @@ -811,7 +861,7 @@ static int bpf_obj_pin(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ)) return -EINVAL; - return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); + return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) @@ -819,9 +869,85 @@ static int bpf_obj_get(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) return -EINVAL; - return bpf_obj_get_user(u64_to_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); +} + +#ifdef CONFIG_CGROUP_BPF + +#define BPF_PROG_ATTACH_LAST_FIELD attach_type + +static int bpf_prog_attach(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + enum bpf_prog_type ptype; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_ATTACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + ptype = BPF_PROG_TYPE_CGROUP_SKB; + break; + case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; + default: + return -EINVAL; + } + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + + cgroup_bpf_update(cgrp, prog, attr->attach_type); + cgroup_put(cgrp); + + return 0; } +#define BPF_PROG_DETACH_LAST_FIELD attach_type + +static int bpf_prog_detach(const union bpf_attr *attr) +{ + struct cgroup *cgrp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_DETACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + cgroup_bpf_update(cgrp, NULL, attr->attach_type); + cgroup_put(cgrp); + break; + + default: + return -EINVAL; + } + + return 0; +} +#endif /* CONFIG_CGROUP_BPF */ + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -888,6 +1014,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; + +#ifdef CONFIG_CGROUP_BPF + case BPF_PROG_ATTACH: + err = bpf_prog_attach(&attr); + break; + case BPF_PROG_DETACH: + err = bpf_prog_detach(&attr); + break; +#endif + default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99a7e5b388f2..d28f9a3380a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19,6 +19,7 @@ #include <net/netlink.h> #include <linux/file.h> #include <linux/vmalloc.h> +#include <linux/stringify.h> /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. @@ -190,6 +191,22 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +static const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + static void print_verifier_state(struct bpf_verifier_state *state) { struct bpf_reg_state *reg; @@ -212,12 +229,13 @@ static void print_verifier_state(struct bpf_verifier_state *state) else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL || t == PTR_TO_MAP_VALUE_ADJ) - verbose("(ks=%d,vs=%d)", + verbose("(ks=%d,vs=%d,id=%u)", reg->map_ptr->key_size, - reg->map_ptr->value_size); + reg->map_ptr->value_size, + reg->id); if (reg->min_value != BPF_REGISTER_MIN_RANGE) - verbose(",min_value=%llu", - (unsigned long long)reg->min_value); + verbose(",min_value=%lld", + (long long)reg->min_value); if (reg->max_value != BPF_REGISTER_MAX_RANGE) verbose(",max_value=%llu", (unsigned long long)reg->max_value); @@ -353,7 +371,8 @@ static void print_bpf_insn(struct bpf_insn *insn) u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %d\n", insn->code, insn->imm); + verbose("(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose("(%02x) goto pc%+d\n", insn->code, insn->off); @@ -447,6 +466,7 @@ static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; + regs[regno].id = 0; regs[regno].imm = 0; } @@ -613,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, - const struct bpf_call_arg_meta *meta) + const struct bpf_call_arg_meta *meta, + enum bpf_access_type t) { switch (env->prog->type) { + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + /* dst_input() and dst_output() can't write for now */ + if (t == BPF_WRITE) + return false; case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_XMIT: if (meta) return meta->pkt_access; @@ -758,7 +785,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if ((s64)reg->min_value < 0) { + if (reg->min_value < 0) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -817,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -950,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + if (type == PTR_TO_PACKET && + !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; } @@ -1112,8 +1140,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %d\n", - map->map_type, func_id); + verbose("cannot pass map_type %d into func %s#%d\n", + map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1170,7 +1198,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %d\n", func_id); + verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1178,7 +1206,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %d\n", func_id); + verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1188,7 +1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } - changes_data = bpf_helper_changes_skb_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(fn->func); memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; @@ -1198,7 +1226,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %d\n", func_id); + verbose("kernel subsystem misconfigured func %s#%d\n", + func_id_name(func_id), func_id); return err; } @@ -1252,9 +1281,10 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; + regs[BPF_REG_0].id = ++env->id_gen; } else { - verbose("unknown return type %d of func %d\n", - fn->ret_type, func_id); + verbose("unknown return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1451,14 +1481,19 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); - /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. - * Don't care about overflow or negative values, just add them + /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' + * insn. Don't care about overflow or negative values, just add them */ if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) dst_reg->imm += insn->imm; else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && src_reg->type == CONST_IMM) dst_reg->imm += src_reg->imm; + else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) + dst_reg->imm |= insn->imm; + else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) + dst_reg->imm |= src_reg->imm; else mark_reg_unknown_value(regs, insn->dst_reg); return 0; @@ -1468,7 +1503,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg) { if (reg->max_value > BPF_REGISTER_MAX_RANGE) reg->max_value = BPF_REGISTER_MAX_RANGE; - if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) + if (reg->min_value < BPF_REGISTER_MIN_RANGE || + reg->min_value > BPF_REGISTER_MAX_RANGE) reg->min_value = BPF_REGISTER_MIN_RANGE; } @@ -1476,8 +1512,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; - u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; - bool min_set = false, max_set = false; + s64 min_val = BPF_REGISTER_MIN_RANGE; + u64 max_val = BPF_REGISTER_MAX_RANGE; u8 opcode = BPF_OP(insn->code); dst_reg = ®s[insn->dst_reg]; @@ -1500,7 +1536,6 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, } else if (insn->imm < BPF_REGISTER_MAX_RANGE && (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { min_val = max_val = insn->imm; - min_set = max_set = true; } /* We don't know anything about what was done to this register, mark it @@ -1512,22 +1547,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, return; } + /* If one of our values was at the end of our ranges then we can't just + * do our normal operations to the register, we need to set the values + * to the min/max since they are undefined. + */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + switch (opcode) { case BPF_ADD: - dst_reg->min_value += min_val; - dst_reg->max_value += max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value += min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value += max_val; break; case BPF_SUB: - dst_reg->min_value -= min_val; - dst_reg->max_value -= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value -= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value -= max_val; break; case BPF_MUL: - dst_reg->min_value *= min_val; - dst_reg->max_value *= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value *= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value *= max_val; break; case BPF_AND: - /* & is special since it could end up with 0 bits set. */ - dst_reg->min_value &= min_val; + /* Disallow AND'ing of negative numbers, ain't nobody got time + * for that. Otherwise the minimum is 0 and the max is the max + * value we could AND against. + */ + if (min_val < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = 0; dst_reg->max_value = max_val; break; case BPF_LSH: @@ -1537,24 +1593,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, */ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else + else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value <<= min_val; if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - else + else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value <<= max_val; break; case BPF_RSH: - dst_reg->min_value >>= min_val; - dst_reg->max_value >>= max_val; - break; - case BPF_MOD: - /* % is special since it is an unsigned modulus, so the floor - * will always be 0. + /* RSH by a negative number is undefined, and the BPF_RSH is an + * unsigned shift, so make the appropriate casts. */ - dst_reg->min_value = 0; - dst_reg->max_value = max_val - 1; + if (min_val < 0 || dst_reg->min_value < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = + (u64)(dst_reg->min_value) >> min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value >>= max_val; break; default: reset_reg_range_values(regs, insn->dst_reg); @@ -1644,8 +1701,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } - regs[insn->dst_reg].type = UNKNOWN_VALUE; - regs[insn->dst_reg].map_ptr = NULL; + mark_reg_unknown_value(regs, insn->dst_reg); } } else { /* case: R = imm @@ -1907,6 +1963,38 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(true_reg); } +static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, + enum bpf_reg_type type) +{ + struct bpf_reg_state *reg = ®s[regno]; + + if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + reg->type = type; + if (type == UNKNOWN_VALUE) + mark_reg_unknown_value(regs, regno); + } +} + +/* The logic is similar to find_good_pkt_pointers(), both could eventually + * be folded together at some point. + */ +static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, + enum bpf_reg_type type) +{ + struct bpf_reg_state *regs = state->regs; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + mark_map_reg(regs, i, regs[regno].id, type); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, + regs[regno].id, type); + } +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -1994,18 +2082,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (opcode == BPF_JEQ) { - /* next fallthrough insn can access memory via - * this register - */ - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - /* branch targer cannot access it, since reg == 0 */ - mark_reg_unknown_value(other_branch->regs, - insn->dst_reg); - } else { - other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - mark_reg_unknown_value(regs, insn->dst_reg); - } + /* Mark all identical map registers in each branch as either + * safe or unknown depending R == 0 or R != 0 conditional. + */ + mark_map_regs(this_branch, insn->dst_reg, + opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE); + mark_map_regs(other_branch, insn->dst_reg, + opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { @@ -2430,6 +2513,7 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { + bool varlen_map_access = env->varlen_map_value_access; struct bpf_reg_state *rold, *rcur; int i; @@ -2443,12 +2527,17 @@ static bool states_equal(struct bpf_verifier_env *env, /* If the ranges were not the same, but everything else was and * we didn't do a variable access into a map then we are a-ok. */ - if (!env->varlen_map_value_access && - rold->type == rcur->type && rold->imm == rcur->imm) + if (!varlen_map_access && + memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0) continue; + /* If we didn't map access then again we don't care about the + * mismatched range values and it's ok if our old type was + * UNKNOWN and we didn't go to a NOT_INIT'ed reg. + */ if (rold->type == NOT_INIT || - (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) + (!varlen_map_access && rold->type == UNKNOWN_VALUE && + rcur->type != NOT_INIT)) continue; if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && @@ -3044,9 +3133,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) struct bpf_verifier_env *env; int ret = -EINVAL; - if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) - return -E2BIG; - /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ @@ -3087,6 +3173,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } + bpf_prog_calc_digest(env->prog); + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 85bc9beb046d..2ee9ec3051b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + + cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); + if (parent) + cgroup_bpf_inherit(cgrp, parent); + cgroup_propagate_control(cgrp); /* @cgrp doesn't have dir yet so the following will only create csses */ @@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void) } subsys_initcall(cgroup_namespaces_init); +#ifdef CONFIG_CGROUP_BPF +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + mutex_lock(&cgroup_mutex); + __cgroup_bpf_update(cgrp, parent, prog, type); + mutex_unlock(&cgroup_mutex); +} +#endif /* CONFIG_CGROUP_BPF */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cpu.c b/kernel/cpu.c index 29de1a9352c0..217fd2e7f435 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -659,7 +659,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); } -#ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) @@ -676,6 +675,7 @@ void __unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(__unregister_cpu_notifier); +#ifdef CONFIG_HOTPLUG_CPU /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id diff --git a/kernel/events/core.c b/kernel/events/core.c index c6e47e97b33f..faf073d0287f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -902,7 +902,15 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - cpuctx->cgrp = add ? event->cgrp : NULL; + + /* + * cpuctx->cgrp is NULL until a cgroup event is sched in or + * ctx->nr_cgroup == 0 . + */ + if (add && perf_cgroup_from_task(current, ctx) == event->cgrp) + cpuctx->cgrp = event->cgrp; + else if (!add) + cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ @@ -1960,6 +1968,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -7075,8 +7089,8 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); @@ -7709,7 +7723,7 @@ static void bpf_overflow_handler(struct perf_event *event, if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); - ret = BPF_PROG_RUN(event->prog, (void *)&ctx); + ret = BPF_PROG_RUN(event->prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -8012,6 +8026,7 @@ restart: * if <size> is not specified, the range is treated as a single address. */ enum { + IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, @@ -8035,6 +8050,7 @@ static const match_table_t if_tokens = { { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, + { IF_ACT_NONE, NULL }, }; /* @@ -8855,7 +8871,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { + int remove_device; + mutex_lock(&pmus_lock); + remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); mutex_unlock(&pmus_lock); @@ -8869,10 +8888,12 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); + if (remove_device) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); + } free_pmu_context(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); diff --git a/kernel/exit.c b/kernel/exit.c index 684de019b674..aacff8e2aec0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -847,6 +847,7 @@ void __noreturn do_exit(long code) */ perf_event_exit_task(tsk); + sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* diff --git a/kernel/fork.c b/kernel/fork.c index 17da35fa77e7..5957cf8b4c4b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,9 @@ static void account_kernel_stack(struct task_struct *tsk, int account) static void release_task_stack(struct task_struct *tsk) { + if (WARN_ON(tsk->state != TASK_DEAD)) + return; /* Better to leak the stack than to free prematurely */ + account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); @@ -351,6 +354,8 @@ void free_task(struct task_struct *tsk) ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); arch_release_task_struct(tsk); + if (tsk->flags & PF_KTHREAD) + free_kthread_struct(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1550,7 +1555,9 @@ static __latent_entropy struct task_struct *copy_process( init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = p->stimescaled = 0; +#endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -1864,6 +1871,7 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + p->state = TASK_DEAD; put_task_stack(p); free_task(p); fork_out: diff --git a/kernel/futex.c b/kernel/futex.c index 2c4be467fecd..9246d9f593d1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1298,7 +1298,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); bool deboost; int ret = 0; @@ -1415,7 +1415,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; int ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (!bitset) return -EINVAL; @@ -1469,7 +1469,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); @@ -1708,7 +1708,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (requeue_pi) { /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9c4d30483264..6b669593e7eb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1341,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; - unsigned int omsk = irq_settings_get_trigger_mask(desc); + unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + irq, omsk, nmsk); } *old_ptr = new; diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3fea9d0..3cbb0c879705 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/printk.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> @@ -53,8 +54,15 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. + * The checks for whether we are in an interrupt are open-coded, because + * 1. We can't use in_interrupt() here, since it also returns true + * when we are inside local_bh_disable() section. + * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), + * since that leads to slower generated code (three separate tests, + * one for each of the flags). */ - if (!t || in_interrupt()) + if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET + | NMI_MASK))) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { diff --git a/kernel/kthread.c b/kernel/kthread.c index be2cc1f9dd57..956495f0efaf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -53,20 +53,29 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define __to_kthread(vfork) \ - container_of(vfork, struct kthread, exited) +static inline void set_kthread_struct(void *kthread) +{ + /* + * We abuse ->set_child_tid to avoid the new member and because it + * can't be wrongly copied by copy_process(). We also rely on fact + * that the caller can't exec, so PF_KTHREAD can't be cleared. + */ + current->set_child_tid = (__force void __user *)kthread; +} static inline struct kthread *to_kthread(struct task_struct *k) { - return __to_kthread(k->vfork_done); + WARN_ON(!(k->flags & PF_KTHREAD)); + return (__force void *)k->set_child_tid; } -static struct kthread *to_live_kthread(struct task_struct *k) +void free_kthread_struct(struct task_struct *k) { - struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork) && try_get_task_stack(k)) - return __to_kthread(vfork); - return NULL; + /* + * Can be NULL if this kthread was created by kernel_thread() + * or if kmalloc() in kthread() failed. + */ + kfree(to_kthread(k)); } /** @@ -181,14 +190,11 @@ static int kthread(void *_create) int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; - struct kthread self; + struct kthread *self; int ret; - self.flags = 0; - self.data = data; - init_completion(&self.exited); - init_completion(&self.parked); - current->vfork_done = &self.exited; + self = kmalloc(sizeof(*self), GFP_KERNEL); + set_kthread_struct(self); /* If user was SIGKILLed, I release the structure. */ done = xchg(&create->done, NULL); @@ -196,6 +202,19 @@ static int kthread(void *_create) kfree(create); do_exit(-EINTR); } + + if (!self) { + create->result = ERR_PTR(-ENOMEM); + complete(done); + do_exit(-ENOMEM); + } + + self->flags = 0; + self->data = data; + init_completion(&self->exited); + init_completion(&self->parked); + current->vfork_done = &self->exited; + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; @@ -203,12 +222,10 @@ static int kthread(void *_create) schedule(); ret = -EINTR; - - if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { - __kthread_parkme(&self); + if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { + __kthread_parkme(self); ret = threadfn(data); } - /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } @@ -409,8 +426,18 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; } -static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) +/** + * kthread_unpark - unpark a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return false, wakes it, and + * waits for it to return. If the thread is marked percpu then its + * bound to the cpu again. + */ +void kthread_unpark(struct task_struct *k) { + struct kthread *kthread = to_kthread(k); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * We clear the IS_PARKED bit here as we don't wait @@ -428,24 +455,6 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) wake_up_state(k, TASK_PARKED); } } - -/** - * kthread_unpark - unpark a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * - * Sets kthread_should_park() for @k to return false, wakes it, and - * waits for it to return. If the thread is marked percpu then its - * bound to the cpu again. - */ -void kthread_unpark(struct task_struct *k) -{ - struct kthread *kthread = to_live_kthread(k); - - if (kthread) { - __kthread_unpark(k, kthread); - put_task_stack(k); - } -} EXPORT_SYMBOL_GPL(kthread_unpark); /** @@ -462,21 +471,20 @@ EXPORT_SYMBOL_GPL(kthread_unpark); */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = to_live_kthread(k); - int ret = -ENOSYS; - - if (kthread) { - if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - if (k != current) { - wake_up_process(k); - wait_for_completion(&kthread->parked); - } + struct kthread *kthread = to_kthread(k); + + if (WARN_ON(k->flags & PF_EXITING)) + return -ENOSYS; + + if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); } - put_task_stack(k); - ret = 0; } - return ret; + + return 0; } EXPORT_SYMBOL_GPL(kthread_park); @@ -503,14 +511,11 @@ int kthread_stop(struct task_struct *k) trace_sched_kthread_stop(k); get_task_struct(k); - kthread = to_live_kthread(k); - if (kthread) { - set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - __kthread_unpark(k, kthread); - wake_up_process(k); - wait_for_completion(&kthread->exited); - put_task_stack(k); - } + kthread = to_kthread(k); + set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); + kthread_unpark(k); + wake_up_process(k); + wait_for_completion(&kthread->exited); ret = k->exit_code; put_task_struct(k); @@ -636,6 +641,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; + int node = -1; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -643,25 +649,17 @@ __kthread_create_worker(int cpu, unsigned int flags, kthread_init_worker(worker); - if (cpu >= 0) { - char name[TASK_COMM_LEN]; - - /* - * kthread_create_worker_on_cpu() allows to pass a generic - * namefmt in compare with kthread_create_on_cpu. We need - * to format it here. - */ - vsnprintf(name, sizeof(name), namefmt, args); - task = kthread_create_on_cpu(kthread_worker_fn, worker, - cpu, name); - } else { - task = __kthread_create_on_node(kthread_worker_fn, worker, - -1, namefmt, args); - } + if (cpu >= 0) + node = cpu_to_node(cpu); + task = __kthread_create_on_node(kthread_worker_fn, worker, + node, namefmt, args); if (IS_ERR(task)) goto fail_task; + if (cpu >= 0) + kthread_bind(task, cpu); + worker->flags = flags; worker->task = task; wake_up_process(task); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..7bd265f6b098 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -506,13 +506,13 @@ static void __print_lock_name(struct lock_class *class) name = class->name; if (!name) { name = __get_key_name(class->key, str); - printk("%s", name); + printk(KERN_CONT "%s", name); } else { - printk("%s", name); + printk(KERN_CONT "%s", name); if (class->name_version > 1) - printk("#%d", class->name_version); + printk(KERN_CONT "#%d", class->name_version); if (class->subclass) - printk("/%d", class->subclass); + printk(KERN_CONT "/%d", class->subclass); } } @@ -522,9 +522,9 @@ static void print_lock_name(struct lock_class *class) get_usage_chars(class, usage); - printk(" ("); + printk(KERN_CONT " ("); __print_lock_name(class); - printk("){%s}", usage); + printk(KERN_CONT "){%s}", usage); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -536,7 +536,7 @@ static void print_lockdep_cache(struct lockdep_map *lock) if (!name) name = __get_key_name(lock->key->subkeys, str); - printk("%s", name); + printk(KERN_CONT "%s", name); } static void print_lock(struct held_lock *hlock) @@ -551,13 +551,13 @@ static void print_lock(struct held_lock *hlock) barrier(); if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { - printk("<RELEASED>\n"); + printk(KERN_CONT "<RELEASED>\n"); return; } print_lock_name(lock_classes + class_idx - 1); - printk(", at: "); - print_ip_sym(hlock->acquire_ip); + printk(KERN_CONT ", at: [<%p>] %pS\n", + (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); } static void lockdep_print_held_locks(struct task_struct *curr) @@ -792,8 +792,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); + printk(KERN_CONT "#%d", class->name_version); + printk(KERN_CONT "\n"); dump_stack(); if (!graph_lock()) { @@ -840,9 +840,9 @@ static struct lock_list *alloc_list_entry(void) /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, - int distance, struct stack_trace *trace) +static int add_lock_to_list(struct lock_class *this, struct list_head *head, + unsigned long ip, int distance, + struct stack_trace *trace) { struct lock_list *entry; /* @@ -1071,7 +1071,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) return 0; printk("\n-> #%u", depth); print_lock_name(target->class); - printk(":\n"); + printk(KERN_CONT ":\n"); print_stack_trace(&target->trace, 6); return 0; @@ -1102,11 +1102,11 @@ print_circular_lock_scenario(struct held_lock *src, if (parent != source) { printk("Chain exists of:\n "); __print_lock_name(source); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(parent); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(target); - printk("\n\n"); + printk(KERN_CONT "\n\n"); } printk(" Possible unsafe locking scenario:\n\n"); @@ -1114,16 +1114,16 @@ print_circular_lock_scenario(struct held_lock *src, printk(" ---- ----\n"); printk(" lock("); __print_lock_name(target); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(parent); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(target); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(source); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1359,22 +1359,22 @@ static void print_lock_class_header(struct lock_class *class, int depth) printk("%*s->", depth, ""); print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); + printk(KERN_CONT " ops: %lu", class->ops); + printk(KERN_CONT " {\n"); for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); + len += printk(KERN_CONT " at:\n"); print_stack_trace(class->usage_traces + bit, len); } } printk("%*s }\n", depth, ""); - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); + printk("%*s ... key at: [<%p>] %pS\n", + depth, "", class->key, class->key); } /* @@ -1437,11 +1437,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry, if (middle_class != unsafe_class) { printk("Chain exists of:\n "); __print_lock_name(safe_class); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(middle_class); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(unsafe_class); - printk("\n\n"); + printk(KERN_CONT "\n\n"); } printk(" Possible interrupt unsafe locking scenario:\n\n"); @@ -1449,18 +1449,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk(" ---- ----\n"); printk(" lock("); __print_lock_name(unsafe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" local_irq_disable();\n"); printk(" lock("); __print_lock_name(safe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(middle_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); __print_lock_name(safe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1497,9 +1497,9 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock(prev); printk("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); - printk(" ->"); + printk(KERN_CONT " ->"); print_lock_name(hlock_class(next)); - printk("\n"); + printk(KERN_CONT "\n"); printk("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); @@ -1521,8 +1521,7 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); - printk("\nthe dependencies between %s-irq-safe lock", irqclass); - printk(" and the holding lock:\n"); + printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) return 0; print_shortest_lock_dependencies(backwards_entry, prev_root); @@ -1694,10 +1693,10 @@ print_deadlock_scenario(struct held_lock *nxt, printk(" ----\n"); printk(" lock("); __print_lock_name(prev); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(next); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); printk(" May be due to missing lock nesting notation\n\n"); } @@ -1869,14 +1868,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + ret = add_lock_to_list(hlock_class(next), &hlock_class(prev)->locks_after, next->acquire_ip, distance, &trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + ret = add_lock_to_list(hlock_class(prev), &hlock_class(next)->locks_before, next->acquire_ip, distance, &trace); if (!ret) @@ -1891,9 +1890,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); - printk(" => "); + printk(KERN_CONT " => "); print_lock_name(hlock_class(next)); - printk("\n"); + printk(KERN_CONT "\n"); dump_stack(); return graph_lock(); } @@ -2343,11 +2342,11 @@ print_usage_bug_scenario(struct held_lock *lock) printk(" ----\n"); printk(" lock("); __print_lock_name(class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); __print_lock_name(class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2522,14 +2521,18 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); - print_ip_sym(curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); - print_ip_sym(curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); - print_ip_sym(curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); - print_ip_sym(curr->softirq_disable_ip); + printk("hardirqs last enabled at (%u): [<%p>] %pS\n", + curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, + (void *)curr->hardirq_enable_ip); + printk("hardirqs last disabled at (%u): [<%p>] %pS\n", + curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, + (void *)curr->hardirq_disable_ip); + printk("softirqs last enabled at (%u): [<%p>] %pS\n", + curr->softirq_enable_event, (void *)curr->softirq_enable_ip, + (void *)curr->softirq_enable_ip); + printk("softirqs last disabled at (%u): [<%p>] %pS\n", + curr->softirq_disable_event, (void *)curr->softirq_disable_ip, + (void *)curr->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3235,8 +3238,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (very_verbose(class)) { printk("\nacquire class [%p] %s", class->key, class->name); if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); + printk(KERN_CONT "#%d", class->name_version); + printk(KERN_CONT "\n"); dump_stack(); } @@ -3378,7 +3381,7 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(") at:\n"); + printk(KERN_CONT ") at:\n"); print_ip_sym(ip); printk("but there are no more locks to release!\n"); printk("\nother info that might help us debug this:\n"); @@ -3871,7 +3874,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(") at:\n"); + printk(KERN_CONT ") at:\n"); print_ip_sym(ip); printk("but there are no locks held!\n"); printk("\nother info that might help us debug this:\n"); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 51c4b24b6328..c2b88490d857 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -46,6 +46,14 @@ enum { (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) /* + * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * .data and .bss to fit in required 32MB limit for the kernel. With + * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * So, reduce the static allocations for lockdeps related structures so that + * everything fits in current required size limit. + */ +#ifdef CONFIG_PROVE_LOCKING_SMALL +/* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. * @@ -54,18 +62,24 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ +#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_STACK_TRACE_ENTRIES 262144UL +#else #define MAX_LOCKDEP_ENTRIES 32768UL #define MAX_LOCKDEP_CHAINS_BITS 16 -#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) - -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ #define MAX_STACK_TRACE_ENTRIES 524288UL +#endif + +#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index c835270f0c2f..6a385aabcce7 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -28,7 +28,7 @@ struct mcs_spinlock { #define arch_mcs_spin_lock_contended(l) \ do { \ while (!(smp_load_acquire(l))) \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } while (0) #endif @@ -108,7 +108,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) return; /* Wait until the next pointer is set */ while (!(next = READ_ONCE(node->next))) - cpu_relax_lowlatency(); + cpu_relax(); } /* Pass lock to next waiter. */ diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 9c951fade415..9aa713629387 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -73,21 +73,8 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic != lock); - - if (!lock->owner) - DEBUG_LOCKS_WARN_ON(!lock->owner); - else - DEBUG_LOCKS_WARN_ON(lock->owner != current); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } - - /* - * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug - * mutexes so that we can do it here after we've verified state. - */ - mutex_clear_owner(lock); - atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 57a871ae3c81..a459faa48987 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -27,16 +27,6 @@ extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} - #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a70b90db3909..9b349619f431 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -27,41 +27,176 @@ #include <linux/debug_locks.h> #include <linux/osq_lock.h> -/* - * In the DEBUG case we are using the "NULL fastpath" for mutexes, - * which forces all calls into the slowpath: - */ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" -# include <asm-generic/mutex-null.h> -/* - * Must be 0 for the debug case so we do not do the unlock outside of the - * wait_lock region. debug_mutex_unlock() will do the actual unlock in this - * case. - */ -# undef __mutex_slowpath_needs_to_unlock -# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" -# include <asm/mutex.h> #endif void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { - atomic_set(&lock->count, 1); + atomic_long_set(&lock->owner, 0); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); - mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif debug_mutex_init(lock, name, key); } - EXPORT_SYMBOL(__mutex_init); +/* + * @owner: contains: 'struct task_struct *' to the current lock owner, + * NULL means not owned. Since task_struct pointers are aligned at + * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low + * bits to store extra state. + * + * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + * Bit1 indicates unlock needs to hand the lock to the top-waiter + */ +#define MUTEX_FLAG_WAITERS 0x01 +#define MUTEX_FLAG_HANDOFF 0x02 + +#define MUTEX_FLAGS 0x03 + +static inline struct task_struct *__owner_task(unsigned long owner) +{ + return (struct task_struct *)(owner & ~MUTEX_FLAGS); +} + +static inline unsigned long __owner_flags(unsigned long owner) +{ + return owner & MUTEX_FLAGS; +} + +/* + * Actual trylock that will work on any unlocked state. + * + * When setting the owner field, we must preserve the low flag bits. + * + * Be careful with @handoff, only set that in a wait-loop (where you set + * HANDOFF) to avoid recursive lock attempts. + */ +static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) +{ + unsigned long owner, curr = (unsigned long)current; + + owner = atomic_long_read(&lock->owner); + for (;;) { /* must loop, can race against a flag */ + unsigned long old, flags = __owner_flags(owner); + + if (__owner_task(owner)) { + if (handoff && unlikely(__owner_task(owner) == current)) { + /* + * Provide ACQUIRE semantics for the lock-handoff. + * + * We cannot easily use load-acquire here, since + * the actual load is a failed cmpxchg, which + * doesn't imply any barriers. + * + * Also, this is a fairly unlikely scenario, and + * this contains the cost. + */ + smp_mb(); /* ACQUIRE */ + return true; + } + + return false; + } + + /* + * We set the HANDOFF bit, we must make sure it doesn't live + * past the point where we acquire it. This would be possible + * if we (accidentally) set the bit on an unlocked mutex. + */ + if (handoff) + flags &= ~MUTEX_FLAG_HANDOFF; + + old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); + if (old == owner) + return true; + + owner = old; + } +} + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * Lockdep annotations are contained to the slow paths for simplicity. + * There is nothing that would stop spreading the lockdep annotations outwards + * except more code. + */ + +/* + * Optimistic trylock that only works in the uncontended case. Make sure to + * follow with a __mutex_trylock() before failing. + */ +static __always_inline bool __mutex_trylock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + return true; + + return false; +} + +static __always_inline bool __mutex_unlock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) + return true; + + return false; +} +#endif + +static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_or(flag, &lock->owner); +} + +static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_andnot(flag, &lock->owner); +} + +static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter) +{ + return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; +} + +/* + * Give up ownership to a specific task, when @task = NULL, this is equivalent + * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE + * semantics like a regular unlock, the __mutex_trylock() provides matching + * ACQUIRE semantics for the handoff. + */ +static void __mutex_handoff(struct mutex *lock, struct task_struct *task) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + for (;;) { + unsigned long old, new; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + new = (owner & MUTEX_FLAG_WAITERS); + new |= (unsigned long)task; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, new); + if (old == owner) + break; + + owner = old; + } +} + #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * We split the mutex lock/unlock logic into separate fastpath and @@ -69,7 +204,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); +static void __sched __mutex_lock_slowpath(struct mutex *lock); /** * mutex_lock - acquire the mutex @@ -95,14 +230,10 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); void __sched mutex_lock(struct mutex *lock) { might_sleep(); - /* - * The locking fastpath is the 1->0 transition from - * 'unlocked' into 'locked' state. - */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); - mutex_set_owner(lock); -} + if (!__mutex_trylock_fast(lock)) + __mutex_lock_slowpath(lock); +} EXPORT_SYMBOL(mutex_lock); #endif @@ -149,9 +280,6 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. - * - * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, - * as the fastpath and opportunistic spinning are disabled in that case. */ static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, @@ -176,7 +304,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, /* * Check if lock is contended, if not there is nobody to wake up */ - if (likely(atomic_read(&lock->base.count) == 0)) + if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) return; /* @@ -227,7 +355,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) bool ret = true; rcu_read_lock(); - while (lock->owner == owner) { + while (__mutex_owner(lock) == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking lock->owner still matches owner. If that fails, @@ -236,12 +364,16 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) */ barrier(); - if (!owner->on_cpu || need_resched()) { + /* + * Use vcpu_is_preempted to detect lock holder preemption issue. + */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { ret = false; break; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); @@ -260,27 +392,25 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) return 0; rcu_read_lock(); - owner = READ_ONCE(lock->owner); + owner = __mutex_owner(lock); + + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ if (owner) - retval = owner->on_cpu; + retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); rcu_read_unlock(); + /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. + * If lock->owner is not set, the mutex has been released. Return true + * such that we'll trylock in the spin path, which is a faster option + * than the blocking slow path. */ return retval; } /* - * Atomically try to take the lock when it is available - */ -static inline bool mutex_try_to_acquire(struct mutex *lock) -{ - return !mutex_is_locked(lock) && - (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); -} - -/* * Optimistic spinning. * * We try to spin for acquisition when we find that the lock owner @@ -288,13 +418,6 @@ static inline bool mutex_try_to_acquire(struct mutex *lock) * need to reschedule. The rationale is that if the lock owner is * running, it is likely to release the lock soon. * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. - * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - * * The mutex spinners are queued up using MCS lock so that only one * spinner can compete for the mutex. However, if mutex spinning isn't * going to happen, there is no point in going through the lock/unlock @@ -302,24 +425,39 @@ static inline bool mutex_try_to_acquire(struct mutex *lock) * * Returns true when the lock was taken, otherwise false, indicating * that we need to jump to the slowpath and sleep. + * + * The waiter flag is set to true if the spinner is a waiter in the wait + * queue. The waiter-spinner will spin on the lock directly and concurrently + * with the spinner at the head of the OSQ, if present, until the owner is + * changed to itself. */ static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { struct task_struct *task = current; - if (!mutex_can_spin_on_owner(lock)) - goto done; + if (!waiter) { + /* + * The purpose of the mutex_can_spin_on_owner() function is + * to eliminate the overhead of osq_lock() and osq_unlock() + * in case spinning isn't possible. As a waiter-spinner + * is not going to take OSQ lock anyway, there is no need + * to call mutex_can_spin_on_owner(). + */ + if (!mutex_can_spin_on_owner(lock)) + goto fail; - /* - * In order to avoid a stampede of mutex spinners trying to - * acquire the mutex all at once, the spinners need to take a - * MCS (queued) lock first before spinning on the owner field. - */ - if (!osq_lock(&lock->osq)) - goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ + if (!osq_lock(&lock->osq)) + goto fail; + } - while (true) { + for (;;) { struct task_struct *owner; if (use_ww_ctx && ww_ctx->acquired > 0) { @@ -335,40 +473,26 @@ static bool mutex_optimistic_spin(struct mutex *lock, * performed the optimistic spinning cannot be done. */ if (READ_ONCE(ww->ctx)) - break; + goto fail_unlock; } /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ - owner = READ_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) - break; - - /* Try to acquire the mutex if it is unlocked. */ - if (mutex_try_to_acquire(lock)) { - lock_acquired(&lock->dep_map, ip); - - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - - ww_mutex_set_context_fastpath(ww, ww_ctx); + owner = __mutex_owner(lock); + if (owner) { + if (waiter && owner == task) { + smp_mb(); /* ACQUIRE */ + break; } - mutex_set_owner(lock); - osq_unlock(&lock->osq); - return true; + if (!mutex_spin_on_owner(lock, owner)) + goto fail_unlock; } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) + /* Try to acquire the mutex if it is unlocked. */ + if (__mutex_trylock(lock, waiter)) break; /* @@ -377,11 +501,20 @@ static bool mutex_optimistic_spin(struct mutex *lock, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax_lowlatency(); + cpu_relax(); } - osq_unlock(&lock->osq); -done: + if (!waiter) + osq_unlock(&lock->osq); + + return true; + + +fail_unlock: + if (!waiter) + osq_unlock(&lock->osq); + +fail: /* * If we fell out of the spin path because of need_resched(), * reschedule now, before we try-lock the mutex. This avoids getting @@ -400,14 +533,14 @@ done: } #else static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { return false; } #endif -__visible __used noinline -void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip); /** * mutex_unlock - release the mutex @@ -422,21 +555,12 @@ void __sched __mutex_unlock_slowpath(atomic_t *lock_count); */ void __sched mutex_unlock(struct mutex *lock) { - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(lock); +#ifndef CONFIG_DEBUG_LOCK_ALLOC + if (__mutex_unlock_fast(lock)) + return; #endif - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); + __mutex_unlock_slowpath(lock, _RET_IP_); } - EXPORT_SYMBOL(mutex_unlock); /** @@ -465,15 +589,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) lock->ctx = NULL; } -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(&lock->base); -#endif - __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); + mutex_unlock(&lock->base); } EXPORT_SYMBOL(ww_mutex_unlock); @@ -509,10 +625,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; + bool first = false; + struct ww_mutex *ww; int ret; if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + ww = container_of(lock, struct ww_mutex, base); if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; } @@ -520,20 +638,21 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + if (__mutex_trylock(lock, false) || + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! */ + lock_acquired(&lock->dep_map, ip); + if (use_ww_ctx) + ww_mutex_set_context_fastpath(ww, ww_ctx); preempt_enable(); return 0; } spin_lock_mutex(&lock->wait_lock, flags); - /* - * Once more, try to acquire the lock. Only try-lock the mutex if - * it is unlocked to reduce unnecessary xchg() operations. + * After waiting to acquire the wait_lock, try again. */ - if (!mutex_is_locked(lock) && - (atomic_xchg_acquire(&lock->count, 0) == 1)) + if (__mutex_trylock(lock, false)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -543,26 +662,26 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + if (__mutex_waiter_is_first(lock, &waiter)) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + lock_contended(&lock->dep_map, ip); + set_task_state(task, state); for (;;) { /* - * Lets try to take the lock again - this is needed even if - * we get here for the first time (shortly after failing to - * acquire the lock), to make sure that we get a wakeup once - * it's unlocked. Later on, if we sleep, this is the - * operation that gives us the lock. We xchg it to -1, so - * that when we release the lock, we properly wake up the - * other waiters. We only attempt the xchg if the count is - * non-negative in order to avoid unnecessary xchg operations: + * Once we hold wait_lock, we're serialized against + * mutex_unlock() handing the lock off to us, do a trylock + * before testing the error conditions to make sure we pick up + * the handoff. */ - if (atomic_read(&lock->count) >= 0 && - (atomic_xchg_acquire(&lock->count, -1) == 1)) - break; + if (__mutex_trylock(lock, first)) + goto acquired; /* - * got a signal? (This code gets eliminated in the - * TASK_UNINTERRUPTIBLE case.) + * Check for signals and wound conditions while holding + * wait_lock. This ensures the lock cancellation is ordered + * against mutex_unlock() and wake-ups do not go missing. */ if (unlikely(signal_pending_state(state, task))) { ret = -EINTR; @@ -575,36 +694,49 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - __set_task_state(task, state); - - /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); + + if (!first && __mutex_waiter_is_first(lock, &waiter)) { + first = true; + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); + } + + set_task_state(task, state); + /* + * Here we order against unlock; we must either see it change + * state back to RUNNING and fall through the next schedule(), + * or we must see its unlock and acquire. + */ + if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) || + __mutex_trylock(lock, first)) + break; + spin_lock_mutex(&lock->wait_lock, flags); } + spin_lock_mutex(&lock->wait_lock, flags); +acquired: __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); - /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); + __mutex_clear_flag(lock, MUTEX_FLAGS); + debug_mutex_free_waiter(&waiter); skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - mutex_set_owner(lock); - if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (use_ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); - } spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable(); return 0; err: + __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); @@ -631,7 +763,6 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched @@ -650,7 +781,6 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); static inline int @@ -715,54 +845,64 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); /* * Release the lock, slowpath: */ -static inline void -__mutex_unlock_common_slowpath(struct mutex *lock, int nested) +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { - unsigned long flags; - WAKE_Q(wake_q); + struct task_struct *next = NULL; + unsigned long owner, flags; + DEFINE_WAKE_Q(wake_q); + + mutex_release(&lock->dep_map, 1, ip); /* - * As a performance measurement, release the lock before doing other - * wakeup related duties to follow. This allows other tasks to acquire - * the lock sooner, while still handling cleanups in past unlock calls. - * This can be done as we do not enforce strict equivalence between the - * mutex counter and wait_list. - * + * Release the lock before (potentially) taking the spinlock such that + * other contenders can get on with things ASAP. * - * Some architectures leave the lock unlocked in the fastpath failure - * case, others need to leave it locked. In the later case we have to - * unlock it here - as the lock counter is currently 0 or negative. + * Except when HANDOFF, in that case we must not clear the owner field, + * but instead set it to the top waiter. */ - if (__mutex_slowpath_needs_to_unlock()) - atomic_set(&lock->count, 1); + owner = atomic_long_read(&lock->owner); + for (;;) { + unsigned long old; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + if (owner & MUTEX_FLAG_HANDOFF) + break; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, + __owner_flags(owner)); + if (old == owner) { + if (owner & MUTEX_FLAG_WAITERS) + break; + + return; + } + + owner = old; + } spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = - list_entry(lock->wait_list.next, - struct mutex_waiter, list); + list_first_entry(&lock->wait_list, + struct mutex_waiter, list); + + next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - wake_q_add(&wake_q, waiter->task); + wake_q_add(&wake_q, next); } - spin_unlock_mutex(&lock->wait_lock, flags); - wake_up_q(&wake_q); -} + if (owner & MUTEX_FLAG_HANDOFF) + __mutex_handoff(lock, next); -/* - * Release the lock, slowpath: - */ -__visible void -__mutex_unlock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); + spin_unlock_mutex(&lock->wait_lock, flags); - __mutex_unlock_common_slowpath(lock, 1); + wake_up_q(&wake_q); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -789,38 +929,30 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock); */ int __sched mutex_lock_interruptible(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_interruptible_slowpath(lock); + + return __mutex_lock_interruptible_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_killable_slowpath(lock); + + return __mutex_lock_killable_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_killable); -__visible void __sched -__mutex_lock_slowpath(atomic_t *lock_count) +static noinline void __sched +__mutex_lock_slowpath(struct mutex *lock) { - struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_, NULL, 0); } @@ -856,37 +988,6 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, #endif -/* - * Spinlock based trylock, we take the spinlock and check whether we - * can get the lock: - */ -static inline int __mutex_trylock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - int prev; - - /* No need to trylock if the mutex is locked. */ - if (mutex_is_locked(lock)) - return 0; - - spin_lock_mutex(&lock->wait_lock, flags); - - prev = atomic_xchg_acquire(&lock->count, -1); - if (likely(prev == 1)) { - mutex_set_owner(lock); - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } - - /* Set it back to 0 if there are no waiters: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - return prev == 1; -} - /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -903,13 +1004,12 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) */ int __sched mutex_trylock(struct mutex *lock) { - int ret; + bool locked = __mutex_trylock(lock, false); - ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); - if (ret) - mutex_set_owner(lock); + if (locked) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return ret; + return locked; } EXPORT_SYMBOL(mutex_trylock); @@ -917,36 +1017,28 @@ EXPORT_SYMBOL(mutex_trylock); int __sched __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock); int __sched __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_interruptible_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock_interruptible); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 6cd6b8e9efd7..4410a4af42a3 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -16,32 +16,6 @@ #define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * The mutex owner can get read and written to locklessly. - * We should use WRITE_ONCE when writing the owner value to - * avoid store tearing, otherwise, a thread could potentially - * read a partially written and incomplete owner value. - */ -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} -#else -static inline void mutex_set_owner(struct mutex *lock) -{ -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ -} -#endif - #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 05a37857ab55..a3167941093b 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr) return cpu_nr + 1; } +static inline int node_cpu(struct optimistic_spin_node *node) +{ + return node->cpu - 1; +} + static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) { int cpu_nr = encoded_cpu_val - 1; @@ -75,7 +80,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, break; } - cpu_relax_lowlatency(); + cpu_relax(); } return next; @@ -118,11 +123,13 @@ bool osq_lock(struct optimistic_spin_queue *lock) while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. + * Use vcpu_is_preempted() to avoid waiting for a preempted + * lock holder: */ - if (need_resched()) + if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) goto unqueue; - cpu_relax_lowlatency(); + cpu_relax(); } return true; @@ -148,7 +155,7 @@ unqueue: if (smp_load_acquire(&node->locked)) return true; - cpu_relax_lowlatency(); + cpu_relax(); /* * Or we race against a concurrent unqueue()'s step-B, in which diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 19248ddf37ce..cc3ed0ccdfa2 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -54,7 +54,7 @@ static __always_inline void rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { - cpu_relax_lowlatency(); + cpu_relax(); cnts = atomic_read_acquire(&lock->cnts); } } @@ -130,7 +130,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; - cpu_relax_lowlatency(); + cpu_relax(); } /* When no more readers, set the locked flag */ @@ -141,7 +141,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) _QW_LOCKED) == _QW_WAITING)) break; - cpu_relax_lowlatency(); + cpu_relax(); } unlock: arch_spin_unlock(&lock->wait_lock); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1ec0f48962b3..2f443ed2320a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) static void fixup_rt_mutex_waiters(struct rt_mutex *lock) { - if (!rt_mutex_has_waiters(lock)) - clear_rt_mutex_waiters(lock); + unsigned long owner, *p = (unsigned long *) &lock->owner; + + if (rt_mutex_has_waiters(lock)) + return; + + /* + * The rbtree has no waiters enqueued, now make sure that the + * lock->owner still has the waiters bit set, otherwise the + * following can happen: + * + * CPU 0 CPU 1 CPU2 + * l->owner=T1 + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T2) + * boost() + * unlock(l->lock) + * block() + * + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T3) + * boost() + * unlock(l->lock) + * block() + * signal(->T2) signal(->T3) + * lock(l->lock) + * dequeue(T2) + * deboost() + * unlock(l->lock) + * lock(l->lock) + * dequeue(T3) + * ==> wait list is empty + * deboost() + * unlock(l->lock) + * lock(l->lock) + * fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * l->owner = owner + * owner = l->owner & ~HAS_WAITERS; + * ==> l->owner = T1 + * } + * lock(l->lock) + * rt_mutex_unlock(l) fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * owner = l->owner & ~HAS_WAITERS; + * cmpxchg(l->owner, T1, NULL) + * ===> Success (l->owner = NULL) + * + * l->owner = owner + * ==> l->owner = T1 + * } + * + * With the check for the waiter bit in place T3 on CPU2 will not + * overwrite. All tasks fiddling with the waiters bit are + * serialized by l->lock, so nothing else can modify the waiters + * bit. If the bit is set then nothing can change l->owner either + * so the simple RMW is safe. The cmpxchg() will simply fail if it + * happens in the middle of the RMW because the waiters bit is + * still set. + */ + owner = READ_ONCE(*p); + if (owner & RT_MUTEX_HAS_WAITERS) + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); } /* @@ -1382,7 +1446,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, struct wake_q_head *wqh)) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); @@ -1555,11 +1619,15 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a * proxy owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @proxy_owner:the task to set as owner * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This initializes the rtmutex and + * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not + * possible at this point because the pi_state which contains the rtmutex + * is not yet visible to other tasks. */ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) @@ -1573,10 +1641,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, /** * rt_mutex_proxy_unlock - release a lock on behalf of owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This merrily cleans up the rtmutex + * (debugging) state. Concurrent operations on this rt_mutex are not + * possible because it belongs to the pi_state which is about to be freed + * and it is not longer visible to other tasks. */ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 4f5f83c7d2d3..990134617b4c 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -71,12 +71,12 @@ task_top_pi_waiter(struct task_struct *p) * lock->owner state tracking: */ #define RT_MUTEX_HAS_WAITERS 1UL -#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2337b4bb2366..631506004f9e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -225,7 +225,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; @@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) goto done; } - ret = owner->on_cpu; + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ + ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); done: rcu_read_unlock(); return ret; @@ -362,13 +366,17 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) */ barrier(); - /* abort spinning when need_resched or owner is not running */ - if (!owner->on_cpu || need_resched()) { + /* + * abort spinning when need_resched or owner is not running or + * owner's cpu is preempted. + */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { rcu_read_unlock(); return false; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); out: @@ -423,7 +431,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax_lowlatency(); + cpu_relax(); } osq_unlock(&sem->osq); done: @@ -461,7 +469,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); @@ -495,7 +503,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* @@ -571,7 +579,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. @@ -625,7 +633,7 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); diff --git a/kernel/module.c b/kernel/module.c index f57dd63186e6..0e54d5bf0097 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1301,8 +1301,9 @@ static int check_version(Elf_Shdr *sechdrs, goto bad_version; } - pr_warn("%s: no symbol version for %s\n", mod->name, symname); - return 0; + /* Broken toolchain. Warn once, then let it go.. */ + pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); + return 1; bad_version: pr_warn("%s: disagrees about version of symbol %s\n", diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1e7f5da648d9..6ccb08f57fcb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); - printk(KERN_INFO "PM: Syncing filesystems ... "); + pr_info("PM: Syncing filesystems ... "); sys_sync(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -203,8 +203,10 @@ static int __init test_suspend(void) /* RTCs have initialized by now too ... can we use one? */ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); - if (dev) + if (dev) { rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } if (!rtc) { printk(warn_no_rtc); return 0; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index de08fc90baaf..577f2288d19f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,17 +253,6 @@ static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); -#ifdef CONFIG_OF -static bool of_specified_console; - -void console_set_by_of(void) -{ - of_specified_console = true; -} -#else -# define of_specified_console false -#endif - /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -794,8 +783,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return ret; } -static void cont_flush(void); - static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -811,7 +798,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -874,7 +860,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) return -ESPIPE; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); switch (whence) { case SEEK_SET: /* the first record */ @@ -913,7 +898,6 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) @@ -1300,7 +1284,6 @@ static int syslog_print(char __user *buf, int size) size_t skip; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1360,7 +1343,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return -ENOMEM; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (buf) { u64 next_seq; u64 seq; @@ -1522,7 +1504,6 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -2185,27 +2166,20 @@ void resume_console(void) /** * console_cpu_notify - print deferred console messages after CPU hotplug - * @self: notifier struct - * @action: CPU hotplug event - * @hcpu: unused + * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be spooled but will not show up on the console. This function is * called when a new CPU comes online (or fails to come up), and ensures * that any such output gets printed. */ -static int console_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - case CPU_DOWN_FAILED: - case CPU_UP_CANCELED: +static int console_cpu_notify(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) { console_lock(); console_unlock(); } - return NOTIFY_OK; + return 0; } /** @@ -2657,7 +2631,7 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0 && !of_specified_console) { + if (preferred_console < 0) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -2843,6 +2817,7 @@ EXPORT_SYMBOL(unregister_console); static int __init printk_late_init(void) { struct console *con; + int ret; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { @@ -2857,7 +2832,12 @@ static int __init printk_late_init(void) unregister_console(con); } } - hotcpu_notifier(console_cpu_notify, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, + console_cpu_notify); + WARN_ON(ret < 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", + console_cpu_notify, NULL); + WARN_ON(ret < 0); return 0; } late_initcall(printk_late_init); @@ -3039,7 +3019,6 @@ void kmsg_dump(enum kmsg_dump_reason reason) dumper->active = true; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; @@ -3130,7 +3109,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, bool ret; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -3173,7 +3151,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bf08fee53dc7..87c51225ceec 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -289,15 +289,24 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct torture_random_state *rrsp) { + unsigned long started; + unsigned long completed; const unsigned long shortdelay_us = 200; const unsigned long longdelay_ms = 50; + unsigned long long ts; /* We want a short delay sometimes to make a reader delay the grace * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { + started = cur_ops->completed(); + ts = rcu_trace_clock_local(); mdelay(longdelay_ms); + completed = cur_ops->completed(); + do_trace_rcu_torture_read(cur_ops->name, NULL, ts, + started, completed); + } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 69a5611a7e7c..96c52e43f7ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1304,7 +1304,8 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) if (!rcu_kick_kthreads) return; j = READ_ONCE(rsp->jiffies_kick_kthreads); - if (time_after(jiffies, j) && rsp->gp_kthread) { + if (time_after(jiffies, j) && rsp->gp_kthread && + (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); rcu_ftrace_dump(DUMP_ALL); wake_up_process(rsp->gp_kthread); @@ -2828,8 +2829,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * Also schedule RCU core processing. * * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. If rcu_pending returns - * false, there is no point in invoking rcu_check_callbacks(). + * invoked from the scheduling-clock interrupt. */ void rcu_check_callbacks(int user) { @@ -3121,7 +3121,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ + /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ WRITE_ONCE(head->func, rcu_leak_callback); @@ -3130,13 +3132,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, } head->func = func; head->next = NULL; - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e99a5234d9ed..fe98dd24adf8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -404,6 +404,7 @@ struct rcu_data { atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ + int exp_dynticks_snap; /* Double-check need for IPI. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 24343eb87b58..d3053e99fdb6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -358,8 +358,10 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + rdp->exp_dynticks_snap = + atomic_add_return(0, &rdtp->dynticks); if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || + !(rdp->exp_dynticks_snap & 0x1) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -377,9 +379,17 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + if (!(mask_ofl_ipi & mask)) continue; retry_ipi: + if (atomic_add_return(0, &rdtp->dynticks) != + rdp->exp_dynticks_snap) { + mask_ofl_test |= mask; + continue; + } ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index a5d966cb8891..da39489d2d80 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). + * + * However, there is no way sched_autogroup_exit_task() could tell us + * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. */ if (p->flags & PF_EXITING) return false; @@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } +void sched_autogroup_exit_task(struct task_struct *p) +{ + /* + * We are going to call exit_notify() and autogroup_move_group() can't + * see this thread after that: we can no longer use signal->autogroup. + * See the PF_EXITING check in task_wants_autogroup(). + */ + sched_move_task(p); +} + static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. + * + * If an exiting thread was already removed from thread list we rely on + * sched_autogroup_exit_task(). + */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -192,6 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; + unsigned long shares; int err; if (nice < MIN_NICE || nice > MAX_NICE) @@ -210,9 +231,10 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) next = HZ / 10 + jiffies; ag = autogroup_task_get(p); + shares = scale_load(sched_prio_to_weight[nice + 20]); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, shares); if (!err) ag->nice = nice; up_write(&ag->lock); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..d18804491d9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -75,11 +75,11 @@ #include <linux/compiler.h> #include <linux/frame.h> #include <linux/prefetch.h> +#include <linux/mutex.h> #include <asm/switch_to.h> #include <asm/tlb.h> #include <asm/irq_regs.h> -#include <asm/mutex.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif @@ -1995,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. + * If (@state & @p->state) @p->state = TASK_RUNNING. * - * Return: %true if @p was woken up, %false if it was already running. - * or @state didn't match @p's state. + * If the task was not queued/runnable, also place it back on a runqueue. + * + * Atomic against schedule() which would dequeue a task, also see + * set_current_state(). + * + * Return: %true if @p->state changes (an actual wakeup was done), + * %false otherwise. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -5192,21 +5193,14 @@ void sched_show_task(struct task_struct *p) int ppid; unsigned long state = p->state; + if (!try_get_task_stack(p)) + return; if (state) state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else if (state == TASK_RUNNING) printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -5221,6 +5215,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); show_stack(p, NULL); + put_task_stack(p); } void show_state_filter(unsigned long state_filter) @@ -5713,7 +5708,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -6190,6 +6185,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the @@ -6307,7 +6303,22 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) WARN_ON(!sg); do { + int cpu, max_cpu = -1; + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + + if (!(sd->flags & SD_ASYM_PACKING)) + goto next; + + for_each_cpu(cpu, sched_group_cpus(sg)) { + if (max_cpu < 0) + max_cpu = cpu; + else if (sched_asym_prefer(cpu, max_cpu)) + max_cpu = cpu; + } + sg->asym_prefer_cpu = max_cpu; + +next: sg = sg->next; } while (sg != sd->groups); @@ -7515,11 +7526,27 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -7592,6 +7619,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get? * diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index bc0b309c3f19..9add206b5608 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[stat], - cputime64_to_clock_t(val[stat])); + (long long)cputime64_to_clock_t(val[stat])); } return 0; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5ebee3164e64..7700a9cba335 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -128,16 +128,13 @@ static inline void task_group_account_field(struct task_struct *p, int index, * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +void account_user_time(struct task_struct *p, cputime_t cputime) { int index; /* Add user time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; @@ -153,16 +150,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime, * Account guest cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +static void account_guest_time(struct task_struct *p, cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); p->gtime += cputime; @@ -180,16 +174,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, * Account system cpu time to a process and desired cpustat field * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated + * @index: pointer to cpustat field that has to be updated */ static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) +void __account_system_time(struct task_struct *p, cputime_t cputime, int index) { /* Add system time to process. */ p->stime += cputime; - p->stimescaled += cputime_scaled; account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -204,15 +195,14 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) + cputime_t cputime) { int index; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); + account_guest_time(p, cputime); return; } @@ -223,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, cputime_scaled, index); + __account_system_time(p, cputime, index); } /* @@ -390,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { u64 cputime = (__force u64) cputime_one_jiffy * ticks; - cputime_t scaled, other; + cputime_t other; /* * When returning from idle, many ticks can get accounted at @@ -403,7 +393,6 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, if (other >= cputime) return; cputime -= other; - scaled = cputime_to_scaled(cputime); if (this_cpu_ksoftirqd() == p) { /* @@ -411,15 +400,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime, scaled); + account_user_time(p, cputime); } else if (p == rq->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime, scaled); + account_guest_time(p, cputime); } else { - __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); + __account_system_time(p, cputime, CPUTIME_SYSTEM); } } @@ -502,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t cputime, scaled, steal; + cputime_t cputime, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -520,12 +509,11 @@ void account_process_tick(struct task_struct *p, int user_tick) return; cputime -= steal; - scaled = cputime_to_scaled(cputime); if (user_tick) - account_user_time(p, cputime, scaled); + account_user_time(p, cputime); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); } @@ -746,7 +734,7 @@ static void __vtime_account_system(struct task_struct *tsk) { cputime_t delta_cpu = get_vtime_delta(tsk); - account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); + account_system_time(tsk, irq_count(), delta_cpu); } void vtime_account_system(struct task_struct *tsk) @@ -767,7 +755,7 @@ void vtime_account_user(struct task_struct *tsk) tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) { delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + account_user_time(tsk, delta_cpu); } write_seqcount_end(&tsk->vtime_seqcount); } @@ -863,29 +851,25 @@ cputime_t task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -static void -fetch_task_cputime(struct task_struct *t, - cputime_t *u_dst, cputime_t *s_dst, - cputime_t *u_src, cputime_t *s_src, - cputime_t *udelta, cputime_t *sdelta) +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { + cputime_t delta; unsigned int seq; - unsigned long long delta; - do { - *udelta = 0; - *sdelta = 0; + if (!vtime_accounting_enabled()) { + *utime = t->utime; + *stime = t->stime; + return; + } + do { seq = read_seqcount_begin(&t->vtime_seqcount); - if (u_dst) - *u_dst = *u_src; - if (s_dst) - *s_dst = *s_src; + *utime = t->utime; + *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_INACTIVE || - is_idle_task(t)) + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; delta = vtime_delta(t); @@ -894,54 +878,10 @@ fetch_task_cputime(struct task_struct *t, * Task runs either in user or kernel space, add pending nohz time to * the right place. */ - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { - *udelta = delta; - } else { - if (t->vtime_snap_whence == VTIME_SYS) - *sdelta = delta; - } + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) + *utime += delta; + else if (t->vtime_snap_whence == VTIME_SYS) + *stime += delta; } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } - - -void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) -{ - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utime) - *utime = t->utime; - if (stime) - *stime = t->stime; - return; - } - - fetch_task_cputime(t, utime, stime, &t->utime, - &t->stime, &udelta, &sdelta); - if (utime) - *utime += udelta; - if (stime) - *stime += sdelta; -} - -void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, cputime_t *stimescaled) -{ - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utimescaled) - *utimescaled = t->utimescaled; - if (stimescaled) - *stimescaled = t->stimescaled; - return; - } - - fetch_task_cputime(t, utimescaled, stimescaled, - &t->utimescaled, &t->stimescaled, &udelta, &sdelta); - if (utimescaled) - *utimescaled += cputime_to_scaled(udelta); - if (stimescaled) - *stimescaled += cputime_to_scaled(sdelta); -} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 37e2449186c4..70ef2b1901e4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -586,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) /* * The task might have changed its scheduling policy to something - * different than SCHED_DEADLINE (through switched_fromd_dl()). + * different than SCHED_DEADLINE (through switched_from_dl()). */ if (!dl_task(p)) { __dl_clear_params(p); @@ -1137,7 +1137,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo pull_dl_task(rq); lockdep_repin_lock(&rq->lock, cookie); /* - * pull_rt_task() can drop (and re-acquire) rq->lock; this + * pull_dl_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to * re-start task selection. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d941c97dfbc3..6559d197e08a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -37,7 +37,6 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -46,31 +45,35 @@ * * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_latency = 6000000ULL; -unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) * * Options are: - * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) - * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling - = SCHED_TUNABLESCALING_LOG; +enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: + * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* - * is kept at sysctl_sched_latency / sysctl_sched_min_granularity + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity */ static unsigned int sched_nr_latency = 8; @@ -82,23 +85,27 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; /* * SCHED_OTHER wake-up granularity. - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SMP /* - * The exponential sliding window over which load is averaged for shares - * distribution. - * (default: 10msec) + * For asym packing, by default the lower numbered cpu has higher priority. */ -unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; +int __weak arch_asym_cpu_priority(int cpu) +{ + return -cpu; +} +#endif #ifdef CONFIG_CFS_BANDWIDTH /* @@ -109,16 +116,18 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; * to consumption or the quota being specified to be smaller than the slice) * we will always only issue the remaining available time. * - * default: 5 msec, units: microseconds - */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + * (default: 5 msec, units: microseconds) + */ +unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif /* * The margin used when comparing utilization with CPU capacity: - * util * 1024 < capacity * margin + * util * margin < capacity * 1024 + * + * (default: ~20%) */ -unsigned int capacity_margin = 1280; /* ~20% */ +unsigned int capacity_margin = 1280; static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -290,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new beg + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; @@ -708,9 +757,7 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -742,7 +789,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; - u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -770,14 +816,12 @@ void post_init_entity_util_avg(struct sched_entity *se) * such that the next switched_to_fair() has the * expected state. */ - se->avg.last_update_time = now; + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); return; } } - update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); } #else /* !CONFIG_SMP */ @@ -2890,6 +2934,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -2969,8 +3033,138 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } } + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load*/ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equals to tg->shares will have + * a load less or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level, can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares). + */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. + */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) @@ -3041,6 +3235,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -3048,6 +3243,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -3064,23 +3260,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return decayed || removed_load; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq); + int decayed; /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); + + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); } @@ -3094,31 +3302,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - * - * Or we're fresh through post_init_entity_util_avg(). - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. - */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3133,14 +3322,12 @@ skip_aging: */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3150,34 +3337,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3206,13 +3379,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) #endif /* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + +/* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3224,9 +3409,7 @@ void remove_entity_load_avg(struct sched_entity *se) * calls this. */ - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -3251,7 +3434,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return 0; } -static inline void update_load_avg(struct sched_entity *se, int not_used) +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1) { cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); } @@ -3396,6 +3582,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3470,6 +3657,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); @@ -3557,7 +3745,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3675,7 +3863,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_load_avg(curr, 1); + update_load_avg(curr, UPDATE_TG); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4572,7 +4760,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4631,7 +4819,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -5199,6 +5387,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return 1; } +static inline int task_util(struct task_struct *p); +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5208,15 +5404,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; + struct sched_group *most_spare_sg = NULL; + unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; if (sd_flag & SD_BALANCE_WAKE) load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, runnable_load; + unsigned long spare_cap, max_spare_cap; int local_group; int i; @@ -5228,8 +5430,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + runnable_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -5238,22 +5445,84 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, else load = target_load(i, load_idx); - avg_load += load; + runnable_load += load; + + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); + + spare_cap = capacity_spare_wake(i, p); + + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; if (local_group) { - this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_runnable_load = runnable_load; + this_avg_load = avg_load; + this_spare = max_spare_cap; + } else { + if (min_runnable_load > (runnable_load + imbalance)) { + /* + * The runnable load is significantly smaller + * so we can pick this new cpu + */ + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + /* + * The runnable loads are close so take the + * blocked load into account through avg_load. + */ + min_avg_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); - if (!idlest || 100*this_load < imbalance*min_load) + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. + */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + + if (this_spare > task_util(p) / 2 && + imbalance_scale*this_spare > 100*most_spare) return NULL; + + if (most_spare > task_util(p) / 2) + return most_spare_sg; + +skip_spare: + if (!idlest) + return NULL; + + if (min_runnable_load > (this_runnable_load + imbalance)) + return NULL; + + if ((this_runnable_load < (min_runnable_load + imbalance)) && + (100*this_avg_load < imbalance_scale*min_avg_load)) + return NULL; + return idlest; } @@ -5590,6 +5859,24 @@ static inline int task_util(struct task_struct *p) } /* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + +/* * Disable WAKE_AFFINE in the case where task @p doesn't fit in the * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. * @@ -5607,6 +5894,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) if (max_cap - min_cap < max_cap >> 3) return 0; + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + return min_cap * 1024 < task_util(p) * capacity_margin; } @@ -6641,6 +6931,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6845,13 +7139,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6864,6 +7159,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -6888,11 +7184,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + min_capacity = min(capacity, min_capacity); } } else { /* @@ -6902,12 +7199,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = min_capacity; } /* @@ -6930,8 +7231,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) * cpumask covering 1 cpu of the first group and 3 cpus of the second group. * Something like: * - * { 0 1 2 3 } { 4 5 6 7 } - * * * * * + * { 0 1 2 3 } { 4 5 6 7 } + * * * * * * * If we were to balance group-wise we'd place two tasks in the first group and * two tasks in the second group. Clearly this is undesired as it will overload @@ -7002,6 +7303,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->min_capacity * capacity_margin < + ref->sgc->min_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7105,6 +7417,20 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -7113,16 +7439,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (env->idle == CPU_NOT_IDLE) return true; /* - * ASYM_PACKING needs to move all the work to the lowest - * numbered CPUs in the group, therefore mark all groups - * higher than ourself as busy. + * ASYM_PACKING needs to move all the work to the highest + * prority CPUs in the group, therefore mark all groups + * of lower priority than ourself as busy. */ - if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && + sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { if (!sds->busiest) return true; - /* Prefer to move from highest possible cpu's work */ - if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) + /* Prefer to move from lowest priority cpu's work */ + if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, + sg->asym_prefer_cpu)) return true; } @@ -7274,8 +7602,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (!sds->busiest) return 0; - busiest_cpu = group_first_cpu(sds->busiest); - if (env->dst_cpu > busiest_cpu) + busiest_cpu = sds->busiest->asym_prefer_cpu; + if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) return 0; env->imbalance = DIV_ROUND_CLOSEST( @@ -7613,10 +7941,11 @@ static int need_active_balance(struct lb_env *env) /* * ASYM_PACKING needs to force migrate tasks from busy but - * higher numbered CPUs in order to pack all tasks in the - * lowest numbered CPUs. + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. */ - if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) + if ((sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(env->dst_cpu, env->src_cpu)) return 1; } @@ -8465,7 +8794,7 @@ static inline bool nohz_kick_needed(struct rq *rq) unsigned long now = jiffies; struct sched_domain_shared *sds; struct sched_domain *sd; - int nr_busy, cpu = rq->cpu; + int nr_busy, i, cpu = rq->cpu; bool kick = false; if (unlikely(rq->idle_balance)) @@ -8516,12 +8845,18 @@ static inline bool nohz_kick_needed(struct rq *rq) } sd = rcu_dereference(per_cpu(sd_asym, cpu)); - if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) { - kick = true; - goto unlock; - } + if (sd) { + for_each_cpu(i, sched_domain_span(sd)) { + if (i == cpu || + !cpumask_test_cpu(i, nohz.idle_cpus_mask)) + continue; + if (sched_asym_prefer(i, cpu)) { + kick = true; + goto unlock; + } + } + } unlock: rcu_read_unlock(); return kick; @@ -8687,32 +9022,45 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } -static void detach_task_cfs_rq(struct task_struct *p) +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); + struct cfs_rq *cfs_rq; - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } -static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8722,10 +9070,36 @@ static void attach_task_cfs_rq(struct task_struct *p) se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - /* Synchronize task with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + /* Synchronize entity with its cfs_rq */ + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } + + detach_entity_cfs_rq(se); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -8779,6 +9153,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif @@ -8839,7 +9216,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8854,8 +9230,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8890,7 +9264,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); + attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 055f935d4421..7b34c7826ca5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -404,6 +404,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT @@ -539,6 +540,11 @@ struct dl_rq { #ifdef CONFIG_SMP +static inline bool sched_asym_prefer(int a, int b) +{ + return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); +} + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -623,6 +629,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -892,7 +899,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ @@ -905,6 +913,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* cpu of highest priority in group */ /* * The CPUs this group covers. diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0db7c8a2afe2..bff9c774987a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -195,7 +195,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) * value always takes priority (ignoring the DATA). */ for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); + u32 cur_ret = BPF_PROG_RUN(f->prog, sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; diff --git a/kernel/smp.c b/kernel/smp.c index bba3b201668d..77fcdb9f2775 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -3,6 +3,9 @@ * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> @@ -543,19 +546,17 @@ void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -void __weak smp_announce(void) -{ - printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); -} - /* Called by boot processor to activate the rest. */ void __init smp_init(void) { + int num_nodes, num_cpus; unsigned int cpu; idle_threads_init(); cpuhp_threads_init(); + pr_info("Bringing up secondary CPUs ...\n"); + /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { if (num_online_cpus() >= setup_max_cpus) @@ -564,8 +565,13 @@ void __init smp_init(void) cpu_up(cpu); } + num_nodes = num_online_nodes(); + num_cpus = num_online_cpus(); + pr_info("Brought up %d node%s, %d CPU%s\n", + num_nodes, (num_nodes > 1 ? "s" : ""), + num_cpus, (num_cpus > 1 ? "s" : "")); + /* Any cleanup work */ - smp_announce(); smp_cpus_done(setup_max_cpus); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 1bf81ef91375..744fa611cae0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); const char * const softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ec9ab2f01489..1eb82661ecdb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -194,7 +194,7 @@ static int multi_cpu_stop(void *data) /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax(); + cpu_relax_yield(); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..39b3368f6de6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -347,13 +347,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_shares_window_ns", - .data = &sysctl_sched_shares_window, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", @@ -990,13 +983,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "kstack_depth_to_print", - .data = &kstack_depth_to_print, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "io_delay_type", .data = &io_delay_type, .maxlen = sizeof(int), diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..8a5e44236f78 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -41,12 +41,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum); static int family_registered; struct kmem_cache *taskstats_cache; -static struct genl_family family = { - .id = GENL_ID_GENERATE, - .name = TASKSTATS_GENL_NAME, - .version = TASKSTATS_GENL_VERSION, - .maxattr = TASKSTATS_CMD_ATTR_MAX, -}; +static struct genl_family family; static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, @@ -54,7 +49,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { +/* + * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. + * Make sure they are always aligned. + */ +static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; @@ -651,6 +650,15 @@ static const struct genl_ops taskstats_ops[] = { }, }; +static struct genl_family family __ro_after_init = { + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, + .module = THIS_MODULE, + .ops = taskstats_ops, + .n_ops = ARRAY_SIZE(taskstats_ops), +}; + /* Needed early in initialization */ void __init taskstats_init_early(void) { @@ -667,7 +675,7 @@ static int __init taskstats_init(void) { int rc; - rc = genl_register_family_with_ops(&family, taskstats_ops); + rc = genl_register_family(&family); if (rc) return rc; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7e4fad75acaa..150242ccfcd2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -89,6 +89,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) *mult = tmp; *shift = sft; } +EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); /*[Clocksource internal variables]--------- * curr_clocksource: diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e582f20f47a4..f246763c9947 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -132,9 +132,9 @@ static inline unsigned long long prof_ticks(struct task_struct *p) } static inline unsigned long long virt_ticks(struct task_struct *p) { - cputime_t utime; + cputime_t utime, stime; - task_cputime(p, &utime, NULL); + task_cputime(p, &utime, &stime); return cputime_to_expires(utime); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3bcb61b52f6c..71496a20e670 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -390,24 +390,16 @@ static int __init tick_nohz_full_setup(char *str) } __setup("nohz_full=", tick_nohz_full_setup); -static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int tick_nohz_cpu_down(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* - * The boot CPU handles housekeeping duty (unbound timers, - * workqueues, timekeeping, ...) on behalf of full dynticks - * CPUs. It must remain online when nohz full is enabled. - */ - if (tick_nohz_full_running && tick_do_timer_cpu == cpu) - return NOTIFY_BAD; - break; - } - return NOTIFY_OK; + /* + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. + */ + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return -EBUSY; + return 0; } static int tick_nohz_init_all(void) @@ -428,7 +420,7 @@ static int tick_nohz_init_all(void) void __init tick_nohz_init(void) { - int cpu; + int cpu, ret; if (!tick_nohz_full_running) { if (tick_nohz_init_all() < 0) @@ -469,7 +461,10 @@ void __init tick_nohz_init(void) for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); - cpu_notifier(tick_nohz_cpu_down_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "kernel/nohz:predown", NULL, + tick_nohz_cpu_down); + WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e2892e454fe3..ea4fbf8477a9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags) #ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { #ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) @@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { + unsigned long jnow = READ_ONCE(jiffies); + /* * We only forward the base when it's idle and we have a delta between * base clock and jiffies. */ - if (!base->is_idle || (long) (jiffies - base->clk) < 2) + if (!base->is_idle || (long) (jnow - base->clk) < 2) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jiffies)) - base->clk = jiffies; + if (time_after(base->next_expiry, jnow)) + base->clk = jnow; else base->clk = base->next_expiry; } #else static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { return get_timer_this_cpu_base(tflags); } @@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { } #endif -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - struct timer_base *target = __get_target_base(base, tflags); - - forward_timer_base(target); - return target; -} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, { for (;;) { struct timer_base *base; - u32 tf = timer->flags; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); @@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long clk = 0, flags; int ret = 0; + BUG_ON(!timer->function); + /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the @@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* - * Take the current timer_jiffies of base, but without holding - * the lock! + * We lock timer base and calculate the bucket index right + * here. If the timer ends up in the same bucket, then we + * just update the expiry time and avoid the whole + * dequeue/enqueue dance. */ - base = get_timer_base(timer->flags); - clk = base->clk; + base = lock_timer_base(timer, &flags); + clk = base->clk; idx = calc_wheel_index(expires, clk); /* @@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) */ if (idx == timer_get_idx(timer)) { timer->expires = expires; - return 1; + ret = 1; + goto out_unlock; } + } else { + base = lock_timer_base(timer, &flags); } timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, false); if (!ret && pending_only) @@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } + /* Try to forward a stale timer base clock */ + forward_timer_base(base); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance - * between calculating 'idx' and taking the lock, only enqueue_timer() - * and trigger_dyntick_cpu() is required. Otherwise we need to - * (re)calculate the wheel index via internal_add_timer(). + * between calculating 'idx' and possibly switching the base, only + * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise + * we need to (re)calculate the wheel index via + * internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); @@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* - * We have a fresh next event. Check whether we can forward the base: + * We have a fresh next event. Check whether we can forward the + * base. We can only do that when @basej is past base->clk + * otherwise we might rewind base->clk. */ - if (time_after(nextevt, jiffies)) - base->clk = jiffies; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; + if (time_after(basej, base->clk)) { + if (time_after(nextevt, basej)) + base->clk = basej; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + } if (time_before_eq(nextevt, basej)) { expires = basem; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5dcb99281259..fa77311dadb2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -422,6 +422,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; case BPF_FUNC_probe_write_user: diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2050a7652a86..da87b3cba5b3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1862,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, /* Update rec->flags */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + /* We need to update only differences of filter_hash */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -1884,6 +1888,10 @@ rollback: /* Roll back what we did above */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (rec == end) goto err_out; @@ -2397,6 +2405,10 @@ void __weak ftrace_replace_code(int enable) return; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec); @@ -2763,7 +2775,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) struct dyn_ftrace *rec; do_for_each_ftrace_rec(pg, rec) { - if (FTRACE_WARN_ON_ONCE(rec->flags)) + if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED)) pr_warn(" %pS flags:%lx\n", (void *)rec->ip, rec->flags); } while_for_each_ftrace_rec(); @@ -3598,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) goto out_unlock; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { ret = enter_record(hash, rec, clear_filter); if (ret < 0) { @@ -3793,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (!ftrace_match_record(rec, &func_g, NULL, 0)) continue; @@ -4685,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, NULL, 0)) { /* if it is in the array */ exists = false; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c143739b8d7..89a2611a1635 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -479,9 +479,7 @@ struct ring_buffer { struct ring_buffer_per_cpu **buffers; -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif + struct hlist_node node; u64 (*clock)(void); struct rb_irq_work irq_work; @@ -1274,11 +1272,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); -#endif - /** * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. @@ -1296,6 +1289,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, long nr_pages; int bsize; int cpu; + int ret; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1303,7 +1297,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer) return NULL; - if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); @@ -1318,17 +1312,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (nr_pages < 2) nr_pages = 2; - /* - * In case of non-hotplug cpu, if the ring-buffer is allocated - * in early initcall, it will not be notified of secondary cpus. - * In that off case, we need to allocate for all possible cpus. - */ -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - cpumask_copy(buffer->cpumask, cpu_online_mask); -#else - cpumask_copy(buffer->cpumask, cpu_possible_mask); -#endif buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -1337,19 +1320,15 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; - for_each_buffer_cpu(buffer, cpu) { - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) - goto fail_free_buffers; - } + cpu = raw_smp_processor_id(); + cpumask_set_cpu(cpu, buffer->cpumask); + buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; -#ifdef CONFIG_HOTPLUG_CPU - buffer->cpu_notify.notifier_call = rb_cpu_notify; - buffer->cpu_notify.priority = 0; - __register_cpu_notifier(&buffer->cpu_notify); - cpu_notifier_register_done(); -#endif + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; mutex_init(&buffer->mutex); @@ -1364,9 +1343,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif fail_free_buffer: kfree(buffer); @@ -1383,18 +1359,11 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - __unregister_cpu_notifier(&buffer->cpu_notify); -#endif + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif - kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); @@ -4633,62 +4602,48 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +/* + * We only allocate new buffers, never free them if the CPU goes down. + * If we were to free the buffer, then the user would lose any trace that was in + * the buffer. + */ +int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { - struct ring_buffer *buffer = - container_of(self, struct ring_buffer, cpu_notify); - long cpu = (long)hcpu; + struct ring_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (cpumask_test_cpu(cpu, buffer->cpumask)) - return NOTIFY_OK; - - nr_pages = 0; - nr_pages_same = 1; - /* check if all cpu sizes are same */ - for_each_buffer_cpu(buffer, cpu_i) { - /* fill in the size from first enabled cpu */ - if (nr_pages == 0) - nr_pages = buffer->buffers[cpu_i]->nr_pages; - if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { - nr_pages_same = 0; - break; - } - } - /* allocate minimum pages, user can later expand it */ - if (!nr_pages_same) - nr_pages = 2; - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) { - WARN(1, "failed to allocate ring buffer on CPU %ld\n", - cpu); - return NOTIFY_OK; + buffer = container_of(node, struct ring_buffer, node); + if (cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; } - smp_wmb(); - cpumask_set_cpu(cpu, buffer->cpumask); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* - * Do nothing. - * If we were to free the buffer, then the user would - * lose any trace that was in the buffer. - */ - break; - default: - break; } - return NOTIFY_OK; + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %u\n", + cpu); + return -ENOMEM; + } + smp_wmb(); + cpumask_set_cpu(cpu, buffer->cpumask); + return 0; } -#endif #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f7b64db285df..54d5270a5042 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7660,10 +7660,21 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); + /* + * The prepare callbacks allocates some memory for the ring buffer. We + * don't free the buffer if the if the CPU goes down. If we were to free + * the buffer, then the user would lose any trace that was in the + * buffer. The memory will be removed once the "instance" is removed. + */ + ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, + "trace/RB:preapre", trace_rb_cpu_prepare, + NULL); + if (ret < 0) + goto out_free_cpumask; /* Used for event triggers */ temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) - goto out_free_cpumask; + goto out_rm_hp_state; if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; @@ -7724,6 +7735,8 @@ out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: ring_buffer_free(temp_buffer); +out_rm_hp_state: + cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: |